-
Notifications
You must be signed in to change notification settings - Fork 57
/
iterative.c
1641 lines (1566 loc) · 66.9 KB
/
iterative.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* A few iterative techniques to solve DDA equations
*
* The linear system is composed so that diagonal terms are equal to 1, therefore use of Jacobi preconditioners does not
* have any effect.
*
* CS methods still converge to the right result even when matrix is slightly non-symmetric (e.g. -int so), however they
* do it much slowly than usually. It is recommended then to use BiCGStab or BCGS2.
*
* Copyright (C) ADDA contributors
* This file is part of ADDA.
*
* ADDA is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
*
* ADDA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with ADDA. If not, see
* <http://www.gnu.org/licenses/>.
*/
#include "const.h" // keep this first
// project headers
#include "cmplx.h"
#include "comm.h"
#include "debug.h"
#include "io.h"
#include "linalg.h"
#include "memory.h"
#include "timing.h"
#include "vars.h"
// system headers
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h> // for time_t & time
#ifdef OCL_BLAS
# include "oclcore.h"
# include <clBLAS.h> //external library
# include <clBLAS.version.h>
#endif
// SEMI-GLOBAL VARIABLES
// defined and initialized in calculator.c
extern doublecomplex *rvec; // can't be declared restrict due to SwapPointers
extern doublecomplex * restrict vec1,* restrict vec2,* restrict vec3,* restrict vec4,* restrict Avecbuffer;
// defined and initialized in fft.c
#if !defined(OPENCL) && !defined(SPARSE)
extern doublecomplex * restrict Xmatrix; // used as storage for arrays in WKB init field
#endif
// defined and initialized in param.c
extern const double iter_eps;
extern const enum init_field InitField;
extern const char *infi_fnameY,*infi_fnameX;
extern const bool recalc_resid;
extern const enum chpoint chp_type;
extern const time_t chp_time;
extern const char *chp_dir;
// defined and initialized in timing.c
extern time_t last_chp_wt;
extern TIME_TYPE Timing_OneIter,Timing_OneIterComm,Timing_InitIter,Timing_InitIterComm,Timing_IntFieldOneComm,
Timing_MVP,Timing_MVPComm,Timing_OneIterMVP,Timing_OneIterMVPComm;
extern size_t TotalIter;
// LOCAL VARIABLES
#define RESID_STRING "RE_%03d = "EFORM // string containing residual value
#define FFORM_PROG "% .6f" // format for progress value
static double inprodR; // used as |r_0|^2 and best squared norm of residual up to some iteration
static double inprodRp1; // used as |r_k+1|^2 and squared norm of current residual
static double epsB; // stopping criterion
static double resid_scale; // scale to get square of relative error
static double prev_err; // previous relative error; used in ProgressReport, initialized in IterativeSolver
static int ind_m; // index of iterative method
static int niter; // iteration count
static int counter; // number of successive iterations without residual decrease
static bool chp_exit; // checkpoint occurred - exit
static bool complete; // complete iteration was performed (not stopped in the middle)
// whether matrix-vector product computed during initialization can be reused at first iteration
static bool matvec_ready;
typedef struct // data for checkpoints
{
void *ptr; // pointer to the data
int size; // size of one element
} chp_data;
chp_data * restrict scalars,* restrict vectors;
enum phase {
PHASE_VARS, // Initialization of variables, and linking them to scalars and vectors
PHASE_INIT, // Initialization of iterations
PHASE_ITER // Each iteration
};
struct iter_params_struct {
enum iter meth; // identifier
int mc; // maximum allowed number of iterations without residual decrease
int sc_N; // number of additional scalars to describe the state
int vec_N; // number of additional vectors to describe the state
void (*func)(const enum phase); // pointer to implementation of the iterative solver
};
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) && (__INTEL_COMPILER < 1200)
# define WORKAROUND146 // workaround for issue 146 (should not be relevant to modern compilers)
static doublecomplex dumb;
#endif
#define ITER_FUNC(name) static void name(const enum phase ph)
ITER_FUNC(BCGS2);
ITER_FUNC(BiCG_CS);
ITER_FUNC(BiCGStab);
ITER_FUNC(CGNR);
ITER_FUNC(CSYM);
ITER_FUNC(QMR_CS);
ITER_FUNC(QMR_CS_2);
/* TO ADD NEW ITERATIVE SOLVER
* Add the line to this list in the alphabetical order, analogous to the ones already present. The variable part is the
* name of the function, implementing the method. The macros expands to a function prototype.
*/
static const struct iter_params_struct params[]={
{IT_BCGS2,15000,2,1,BCGS2},
{IT_BICG_CS,50000,1,0,BiCG_CS},
{IT_BICGSTAB,30000,3,3,BiCGStab},
{IT_CGNR,10,1,0,CGNR},
{IT_CSYM,10,6,2,CSYM},
{IT_QMR_CS,50000,8,3,QMR_CS},
{IT_QMR_CS_2,50000,5,2,QMR_CS_2}
/* TO ADD NEW ITERATIVE SOLVER
* Add its parameters to this list in the alphabetical order. The parameters, in order of appearance, are identifier
* (specified in const.h), maximum allowed number of iterations without the residual decrease, numbers of additional
* scalars and vectors to describe the state of the iterative solver (see comment before function SaveCheckpoint),
* and name of a function, implementing the method.
*/
};
// EXTERNAL FUNCTIONS
// matvec.c
void MatVec(doublecomplex * restrict in,doublecomplex * restrict out,double * inprod,bool her,TIME_TYPE *timing,
TIME_TYPE *comm_timing);
#ifdef OCL_BLAS
// Test clBLAS version (specific numbers is because we never considered earlier versions)
# define CLBLAS_VER_REQ 2
# define CLBLAS_SUBVER_REQ 12
# if !GREATER_EQ2(clblasVersionMajor,clblasVersionMinor,CLBLAS_VER_REQ,CLBLAS_SUBVER_REQ)
# error "clBLAS version is too old"
# endif
// Error-checking functionality for clBLAS
# define CLBLAS_CH_ERR(a) Check_clBLAS_Err(a,ALL_POS)
//======================================================================================================================
static const char *Print_clBLAS_Errstring(clblasStatus err)
// produces meaningful error message from the clBLAS-specific error code, based on clBLAS.h v.2.12.0 (NULL if not found)
{
switch (err) {
case clblasNotImplemented: return "Functionality not implemented";
case clblasNotInitialized: return "Library not initialized";
case clblasInvalidMatA: return "Invalid matrix A";
case clblasInvalidMatB: return "Invalid matrix B";
case clblasInvalidMatC: return "Invalid matrix C";
case clblasInvalidVecX: return "Invalid vector X";
case clblasInvalidVecY: return "Invalid vector Y";
case clblasInvalidDim: return "Invalid input dimensions";
case clblasInvalidLeadDimA: return "Leading dimension of matrix A smaller than the first size";
case clblasInvalidLeadDimB: return "Leading dimension of matrix B smaller than the second size";
case clblasInvalidLeadDimC: return "Leading dimension of matrix C smaller than the third size";
case clblasInvalidIncX: return "Null increment for vector X";
case clblasInvalidIncY: return "Null increment for vector Y";
case clblasInsufficientMemMatA: return "Insufficient memory for matrix A";
case clblasInsufficientMemMatB: return "Insufficient memory for matrix B";
case clblasInsufficientMemMatC: return "Insufficient memory for matrix C";
case clblasInsufficientMemVecX: return "Insufficient memory for vector X";
case clblasInsufficientMemVecY: return "Insufficient memory for vector Y";
default: return NULL;
}
}
//======================================================================================================================
static void Check_clBLAS_Err(const clblasStatus err,ERR_LOC_DECL)
/* Checks error code for clBLAS calls and prints error if necessary. First searches among clBLAS specific errors. If not
* found, uses general error processing for CL calls (since clBLAS error codes can take standard cl values as well).
*/
{
if (err != clblasSuccess) {
const char *str=Print_clBLAS_Errstring(err);
if (str!=NULL) LogError(ERR_LOC_CALL,"clBLAS error code %d: %s\n",err,str);
else PrintCLErr((cl_int)err,ERR_LOC_CALL,NULL);
}
}
#endif
//======================================================================================================================
static void MatVec_wrapper(doublecomplex * restrict in,doublecomplex * restrict out,double * inprod,bool her,
TIME_TYPE *timing,TIME_TYPE *comm_timing)
/* function wrapper for MatVec to be called within the iterative solver if the solver is able to use clBLAS, i.e.
* the host and GPU memory does not have to be synchronized. Currently it is only used in the BiCG solver.
*/
{
#ifdef OCL_BLAS
bufupload=false;
#endif
MatVec(in,out,inprod,her,timing,comm_timing);
#ifdef OCL_BLAS
bufupload=true;
#endif
}
//======================================================================================================================
static inline void SwapPointers(doublecomplex **a,doublecomplex **b)
/* swap two pointers of (doublecomplex *) type; should work for others but will give "Suspicious pointer conversion"
* warning. While this is a convenient function that can save some copying between memory blocks, it doesn't allow
* consistent usage of 'restrict' keyword for affected pointers. This may hamper some optimizations. Hopefully, the most
* important optimizations are those in the linalg.c, which can be improved by using 'restrict' keyword in the functions
* themselves.
*/
{
doublecomplex *tmp;
tmp=*a;
*a=*b;
*b=tmp;
}
//======================================================================================================================
/* Checkpoint systems saves the current state of the iterative solver to the file. By default (for every iterative
* solver) a number of scalars and vectors are saved. The scalars include, among others, inprodR. There are 3 default
* vectors: xvec, rvec, pvec (Avecbuffer is _not_ saved). If the iterative solver requires any other scalars or vectors
* to describe its state, this information should be specified in structure arrays 'scalars' and 'vectors'.
*/
static void SaveIterChpoint(void)
/* save a binary checkpoint; only limitedly foolproof - user should take care to load checkpoints on the same machine
* (number of processors) and with the same command line.
*/
{
int i;
char fname[MAX_FNAME];
FILE * restrict chp_file;
TIME_TYPE tstart;
tstart=GET_TIME();
if (IFROOT) {
// create directory "chp_dir" if needed and open info file
SnprintfErr(ONE_POS,fname,MAX_FNAME,"%s/"F_CHP_LOG,chp_dir);
if ((chp_file=fopen(fname,"w"))==NULL) {
MkDirErr(chp_dir,ONE_POS);
chp_file=FOpenErr(fname,"w",ONE_POS);
}
// write info and close file
fprintf(chp_file,"Info about the run, which produced the checkpoint, can be found in ../%s",directory);
FCloseErr(chp_file,fname,ONE_POS);
}
// wait to ensure that directory exists
Synchronize();
// open output file; writing errors are checked only for vectors
SnprintfErr(ALL_POS,fname,MAX_FNAME,"%s/"F_CHP,chp_dir,ringid);
chp_file=FOpenErr(fname,"wb",ALL_POS);
// write common scalars
fwrite(&ind_m,sizeof(int),1,chp_file);
fwrite(&local_nRows,sizeof(size_t),1,chp_file);
fwrite(&niter,sizeof(int),1,chp_file);
fwrite(&counter,sizeof(int),1,chp_file);
fwrite(&inprodR,sizeof(double),1,chp_file);
fwrite(&prev_err,sizeof(double),1,chp_file); // written on ALL processors but used only on root
fwrite(&resid_scale,sizeof(double),1,chp_file);
// write specific scalars
for (i=0;i<params[ind_m].sc_N;i++) fwrite(scalars[i].ptr,scalars[i].size,1,chp_file);
// write common vectors
if (fwrite(xvec,sizeof(doublecomplex),local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed writing to file '%s'",fname);
if (fwrite(rvec,sizeof(doublecomplex),local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed writing to file '%s'",fname);
if (fwrite(pvec,sizeof(doublecomplex),local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed writing to file '%s'",fname);
// write specific vectors
for (i=0;i<params[ind_m].vec_N;i++) if (fwrite(vectors[i].ptr,vectors[i].size,local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed writing to file '%s'",fname);
// close file
FCloseErr(chp_file,fname,ALL_POS);
// write info to logfile after everyone is finished
Synchronize();
if (IFROOT) PrintBoth(logfile,"Checkpoint (iteration) saved\n");
Timing_FileIO+=GET_TIME()-tstart;
Synchronize(); // this is to ensure that message above appears if and only if OK
}
//======================================================================================================================
static void LoadIterChpoint(void)
/* load a binary checkpoint; only limitedly foolproof - user should take care to load checkpoints on the same machine
* (number of processors) and with the same command line.
*/
{
int i;
int ind_m_new;
size_t local_nRows_new;
char fname[MAX_FNAME],ch;
FILE * restrict chp_file;
TIME_TYPE tstart;
tstart=GET_TIME();
// open input file; reading errors are checked only for vectors
SnprintfErr(ALL_POS,fname,MAX_FNAME,"%s/"F_CHP,chp_dir,ringid);
chp_file=FOpenErr(fname,"rb",ALL_POS);
/* check for consistency. This implies that the same index corresponds to the same iterative solver in list params.
* So if the ADDA executable was changed, e.g. by adding a new iterative solver, between writing and reading
* checkpoint, this test may fail.
*/
fread(&ind_m_new,sizeof(int),1,chp_file);
if (ind_m_new!=ind_m) LogError(ALL_POS,"File '%s' is for different iterative method",fname);
fread(&local_nRows_new,sizeof(size_t),1,chp_file);
if (local_nRows_new!=local_nRows) LogError(ALL_POS,"File '%s' is for different vector size",fname);
// read common scalars
fread(&niter,sizeof(int),1,chp_file);
fread(&counter,sizeof(int),1,chp_file);
fread(&inprodR,sizeof(double),1,chp_file);
fread(&prev_err,sizeof(double),1,chp_file); // read on ALL processors but used only on root
fread(&resid_scale,sizeof(double),1,chp_file);
// read specific scalars
for (i=0;i<params[ind_m].sc_N;i++) fread(scalars[i].ptr,scalars[i].size,1,chp_file);
// read common vectors
if (fread(xvec,sizeof(doublecomplex),local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed reading from file '%s'",fname);
if (fread(rvec,sizeof(doublecomplex),local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed reading from file '%s'",fname);
if (fread(pvec,sizeof(doublecomplex),local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed reading from file '%s'",fname);
// read specific vectors
for (i=0;i<params[ind_m].vec_N;i++) if (fread(vectors[i].ptr,vectors[i].size,local_nRows,chp_file)!=local_nRows)
LogError(ALL_POS,"Failed reading from file '%s'",fname);
// check if EOF reached and close file
if (fread(&ch,1,1,chp_file)!=0) LogError(ALL_POS,"File '%s' is too long",fname);
FCloseErr(chp_file,fname,ALL_POS);
// initialize auxiliary variables
epsB=iter_eps*iter_eps/resid_scale;
// print info
if (IFROOT) {
PrintBoth(logfile,"Checkpoint (iteration) loaded\n");
// if residual is stagnating print info about last minimum
if (counter!=0) fprintf(logfile,"Residual has been stagnating already for %d iterations since:\n"
RESID_STRING"\n...\n",counter,niter-counter-1,sqrt(resid_scale*inprodR));
}
Timing_FileIO+=GET_TIME()-tstart;
}
//======================================================================================================================
static void ProgressReport(void)
// Do common procedures; show progress in logfile and stdout; also check for checkpoint condition
{
double err,progr,elapsed;
char progr_string[MAX_LINE];
const char *temp;
time_t wt;
if (inprodRp1<=inprodR) {
inprodR=inprodRp1;
counter=0;
}
else counter++;
if (IFROOT) {
err=sqrt(resid_scale*inprodRp1);
progr=1-err/prev_err;
if (counter==0) temp="+ ";
else if (progr>0) temp="-+";
else temp="- ";
SnprintfErr(ONE_POS,progr_string,MAX_LINE,RESID_STRING" %s",niter,err,temp);
if (!orient_avg) fprintf(logfile,"%s progress ="FFORM_PROG"\n",progr_string,progr);
PRINTFB("%s\n",progr_string);
prev_err=err;
}
niter++;
TotalIter++;
// check condition for checkpoint; checkpoint is saved at first time
if (chp_type!=CHP_NONE && chp_time!=UNDEF && complete) {
time(&wt);
elapsed=difftime(wt,last_chp_wt);
if (chp_time<elapsed) {
SaveIterChpoint();
time(&last_chp_wt);
if (chp_type!=CHP_REGULAR) chp_exit=true;
}
}
}
//======================================================================================================================
static double ResidualNorm2(doublecomplex * restrict x,doublecomplex * restrict r,doublecomplex * restrict buffer,
TIME_TYPE *mvp_timing,TIME_TYPE *mvp_comm_timing,TIME_TYPE *comm_timing)
/* Computes ||Ax-b||^2, where b=sqrt(C).Einc; buffer is used for Ax, r contains Ax-b at the end; comm_timing is
* incremented with communication time. If only the norm is required, the calculation can be done without using vector
* r, but this does not make a lot of sense, since memory is allocated anyway.
*/
{
double res;
TIME_TYPE mc_time=0;
MatVec(x,buffer,NULL,false,mvp_timing,&mc_time);
(*mvp_comm_timing) += mc_time;
(*comm_timing) += mc_time;
nMult_mat(r,Einc,cc_sqrt);
nDecrem(r,buffer,&res,comm_timing);
return res;
}
//======================================================================================================================
ITER_FUNC(BCGS2)
/* Enhanced Bi-CGStab(2) method.
* Based on the code by M.A. Botchev and D.R. Fokkema - http://www.math.uu.nl/people/vorst/zbcg2.f90 and
* D. R. Fokkema, "Enhanced implementation of BiCGstab(l) for solving linear systems of equations," Preprint 976,
* Department of Mathematics, Utrecht University (1996).
*
* "Reliable update part" was removed, since tests using '-recalc_resid' show that it is (almost) never needed.
*
* For l=1, the method is equivalent to BiCGStab, rewritten through 2-term recurrences (as QMR2 is equivalent to QMR),
* so we use l=2 here. In many cases one iteration of this method is similar to two iterations of BiCGStab, but overall
* convergence is slightly better.
* Breakdown tests were made to coincide with that for BiCGStab for l=1.
*
* !!! This iterative solver produces segmentation fault when compiled with icc 11.1. Probably that is related to
* issue 146. But we leave it be (assume that this is a compiler bug). Even if someone uses this compiler, he can
* live fine without this iterative solver.
*/
{
#define LL 2 // potentially the method will also work for l=1 (but memory allocation and freeing need to be adjusted)
#define EPS1 1E-10 // for 1/|beta|
#define EPS2 1E-10 // for |u_j+1.r~|/|r_j.r~|
static doublecomplex * restrict r[LL+1],* restrict u[LL+1];
static doublecomplex matrix_z[LL+1][LL+1],y0[LL+1],yl[LL+1],zy0[LL+1],zyl[LL+1];
static int i,j;
static doublecomplex alpha,beta,omega,rho0,rho1,sigma,varrho,hatgamma,temp1;
static double kappa0,kappal,dtmp;
switch (ph) {
case PHASE_VARS:
/* rename some vectors; this doesn't contradict with 'restrict' keyword, since new names are not used
* together with old names
*/
r[0]=rvec;
r[1]=vec1;
u[0]=vec2;
u[1]=Avecbuffer;
if (LL==2) {
r[2]=vec3;
u[2]=vec4;
}
// initialize data structure for checkpoints
scalars[0].ptr=&rho0;
scalars[1].ptr=α
scalars[0].size=scalars[1].size=sizeof(doublecomplex);
vectors[0].ptr=vec2; // u[0]
vectors[0].size=sizeof(doublecomplex);
#ifdef __INTEL_COMPILER // workaround for issue 286
/* otherwise, the Intel compiler Classic (last tested for v. 2021.2) produces broken code with -O1 and
* higher, by ignoring some of the pointer assignments above r[1],r[2],u[1],u[2]. We were not able to solve
* the issue by dumb variable assignments
*/
fflush(stdout);
#endif
return;
case PHASE_INIT:
if (!load_chpoint) {
nCopy(pvec,rvec); // (pvec = r~0) = r0
rho0=-1;
}
return;
case PHASE_ITER:
// --- The BiCG part ---
for (j=0;j<LL;j++) {
rho1=nDotProd(r[j],pvec,&Timing_OneIterComm); // rho1 = r_j.r~0
// u_i = r_i - beta*u_i
if (niter==1 && j==0) nCopy(u[0],r[0]);
else {
// test for zero rho0 (1/beta)
dtmp=cabs(rho0)/(cabs(rho1)*cabs(alpha)); // assume that rho1 is not exactly zero
Dz("1/|beta|="GFORM_DEBUG,dtmp);
if (dtmp<EPS1) LogError(ONE_POS,"BCGS2 fails: 1/|beta| is too small ("GFORM_DEBUG").",dtmp);
beta=alpha*rho1/rho0;
// u_i = r_i - beta*u_i
temp1=-beta;
for (i=0;i<=j;i++) nIncrem10_cmplx(u[i],r[i],temp1,NULL,NULL);
}
rho0=rho1;
// u_j+1 = A.u_j
if (niter==1 && j==0 && matvec_ready) {} // do nothing; u[1]<=>Avecbuffer already contains matvec result
else MatVec(u[j],u[j+1],NULL,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
sigma=nDotProd(u[j+1],pvec,&Timing_OneIterComm); // sigma = u_j+1.r~0
// test for zero sigma (1/alpha)
dtmp=cabs(sigma)/cabs(rho1); // assume that rho1 is not exactly zero
Dz("|u_%d.r~|/|r_%d.r~|="GFORM_DEBUG,j+1,j,dtmp);
if (dtmp<EPS1)
LogError(ONE_POS,"BCGS2 fails: |u_%d.r~|/|r_%d.r~| is too small ("GFORM_DEBUG").",j+1,j,dtmp);
alpha = rho1/sigma;
nIncrem01_cmplx(xvec,u[0],alpha,NULL,NULL); // x = x + alpha*u_0
// r_i = r_i - alpha*u_i+1
temp1=-alpha;
for (i=0;i<=j;i++) nIncrem01_cmplx(r[i],u[i+1],temp1,NULL,NULL);
MatVec(r[j],r[j+1],NULL,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
}
// --- The convex polynomial part ---
// Z = R'R
for(i=0;i<=LL;i++) for (j=0;j<=i;j++) {
matrix_z[i][j]=nDotProd(r[j],r[i],&Timing_OneIterComm);
if (i!=j) matrix_z[j][i]=conj(matrix_z[i][j]);
}
// small vectors y0 and yl
y0[0]=-1;
if (LL==2) y0[1]=matrix_z[1][0]/matrix_z[1][1]; // works only for l<=2
y0[LL]=0;
yl[0]=0;
if (LL==2) yl[1]=matrix_z[1][2]/matrix_z[1][1]; // works only for l<=2
yl[LL]=-1;
// Convex combination
// compute Zy0 and Zyl
for (i=0;i<=LL;i++) {
zy0[i]=zyl[i]=0;
for (j=0;j<=LL;j++) {
zy0[i]+=matrix_z[i][j]*y0[j];
zyl[i]+=matrix_z[i][j]*yl[j];
}
}
// kappa0 = sqrt(y0.Zy0); kappal = sqrt(yl.Zyl); employs that dot products are always real
dtmp=0;
for (i=0;i<=LL;i++) dtmp+=creal(zy0[i]*conj(y0[i]));
kappa0=sqrt(dtmp);
dtmp=0;
for (i=0;i<=LL;i++) dtmp+=creal(zyl[i]*conj(yl[i]));
kappal=sqrt(dtmp);
// varrho = Zy0.yl/(kappa0*kappal)
varrho=0;
for (i=0;i<=LL;i++) varrho+=zy0[i]*conj(yl[i]);
varrho/=kappa0*kappal;
// hatgamma = varrho/abs(varrho) * max( abs(varrho),0.7)
dtmp=cabs(varrho);
hatgamma=varrho*MAX(dtmp,0.7)/dtmp;
// y0 = y0 - (hatgamma*kappa0/kappal)*yl
temp1=-hatgamma*kappa0/kappal;
for (i=0;i<=LL;i++) y0[i]+=temp1*yl[i];
// Update
omega = y0[LL];
for (i=1;i<=LL;i++) {
temp1=-y0[i];
nIncrem01_cmplx(u[0],u[i],temp1,NULL,NULL); // u_0 = u_0 - y0[i]*u_i
nIncrem01_cmplx(xvec,r[i-1],y0[i],NULL,NULL); // x = x + y0[i]*r_i-1
nIncrem01_cmplx(r[0],r[i],temp1,NULL,NULL); // r_0 = r_0 - y0[i]*r_i
}
// y0 has changed; compute Zy0 once more
for (i=0;i<=LL;i++) {
zy0[i]=0;
for (j=0;j<=LL;j++) zy0[i]+=matrix_z[i][j]*y0[j];
}
// |r|^2 = y0.Zy0
inprodRp1=0;
for (i=0;i<=LL;i++) inprodRp1+=creal(zy0[i]*conj(y0[i]));
// rho0 = -omega*rho0; moved from the beginning of the iteration
rho0*=-omega;
return; // end of PHASE_ITER
}
LogError(ONE_POS,"Unknown phase (%d) of the iterative solver",(int)ph);
}
#undef LL
#undef EPS1
#undef EPS2
//======================================================================================================================
ITER_FUNC(BiCG_CS)
/* Bi-Conjugate Gradient for Complex Symmetric systems, based on:
* Freund R.W. "Conjugate gradient-type methods for linear systems with complex symmetric coefficient matrices",
* SIAM Journal of Scientific Statistics and Computation, 13(1):425-448,1992.
*
* it is also identical to COCG, described in:
* van der Vorst H.A., Melissen J.B.M. "A Petrov-Galerkin type method for solving Ax=b, where A is symmetric complex",
* IEEE Transactions on Magnetics, 26(2):706-708, 1990.
*/
{
#define EPS1 1E-10 // for (rT.r)/(r.r)
#define EPS2 1E-10 // for (pT.A.p)/(rT.r)
static doublecomplex alpha, mu;
static doublecomplex beta,ro_new,ro_old,temp;
static double dtmp,abs_ro_new;
#ifdef OCL_BLAS
cl_mem bufro_new;
cl_mem bufmu;
cl_mem bufinprodRp1;
cl_mem bufpvec=bufargvec;
cl_mem bufAvecbuffer=bufresultvec;
#endif
switch (ph) {
case PHASE_VARS:
scalars[0].ptr=&ro_old;
scalars[0].size=sizeof(doublecomplex);
return;
case PHASE_INIT: {
#ifdef OCL_BLAS
/* This initialization part need to be moved somewhere during further adoption of clBLAS
* For now, we use braces around this case to allow internal variable declaration
*/
cl_uint major,minor,patch;
CLBLAS_CH_ERR(clblasGetVersion(&major,&minor,&patch));
if (!GREATER_EQ2(major,minor,CLBLAS_VER_REQ,CLBLAS_SUBVER_REQ)) LogError(ONE_POS,
"clBLAS library version (%u.%u) is too old. Version %d.%d or newer is required",
major,minor,CLBLAS_VER_REQ,CLBLAS_SUBVER_REQ);
D("clBLAS library version - %u.%u.%u",major,minor,patch);
D("clblasSetup started");
CLBLAS_CH_ERR(clblasSetup());
CL_CH_ERR(clEnqueueWriteBuffer(command_queue,bufpvec,CL_FALSE,0,sizeof(doublecomplex)*local_nRows,pvec,0,
NULL,NULL));
CL_CH_ERR(clEnqueueWriteBuffer(command_queue,bufrvec,CL_FALSE,0,sizeof(doublecomplex)*local_nRows,rvec,0,
NULL,NULL));
CL_CH_ERR(clEnqueueWriteBuffer(command_queue,bufxvec,CL_FALSE,0,sizeof(doublecomplex)*local_nRows,xvec,0,
NULL,NULL));
#endif
return; // no specific initialization required (if not OCL_BLAS)
}
case PHASE_ITER:
#ifdef OCL_BLAS
/* TODO: Initialization of this two and one other scalar buffers (and then their release) happens at each
* iteration. This is not a bottleneck, but still seems redundant. But to solve this problem we probably
* need to add additional phase of the iterative solver, like PHASE_RELEASE, where all such buffers can be
* released.
*/
CREATE_CL_BUFFER(bufro_new,CL_MEM_READ_WRITE,sizeof(doublecomplex),NULL);
CREATE_CL_BUFFER(bufmu,CL_MEM_READ_WRITE,sizeof(doublecomplex),NULL);
CLBLAS_CH_ERR(clblasZdotu(local_nRows,bufro_new,0,bufrvec,0,1,bufrvec,0,1,buftmp,1,&command_queue,0,NULL,
NULL));
CL_CH_ERR(clEnqueueReadBuffer(command_queue,bufro_new,CL_TRUE,0,sizeof(doublecomplex),&ro_new,0,NULL,NULL));
#else
ro_new=nDotProdSelf_conj(rvec,&Timing_OneIterComm);
#endif
// ro_k-1=r_k-1(*).r_k-1; check for ro_k-1!=0
abs_ro_new=cabs(ro_new);
dtmp=abs_ro_new/inprodR;
Dz("|rT.r|/(r.r)="GFORM_DEBUG,dtmp);
if (dtmp<EPS1) LogError(ONE_POS,"BiCG_CS fails: |rT.r|/(r.r) is too small ("GFORM_DEBUG").",dtmp);
if (niter==1) {
#ifdef OCL_BLAS
clEnqueueCopyBuffer(command_queue,bufrvec,bufpvec,0,0,sizeof(doublecomplex)*local_nRows,0,NULL,NULL);
#else
nCopy(pvec,rvec); // p_1=r_0
#endif
}
else {
// beta_k-1=ro_k-1/ro_k-2
beta=ro_new/ro_old;
// p_k=beta_k-1*p_k-1+r_k-1
#ifdef OCL_BLAS
cl_double2 clbeta = {.s={creal(beta),cimag(beta)}};
CLBLAS_CH_ERR(clblasZscal(local_nRows,clbeta,bufpvec,0,1,1,&command_queue,0,NULL,NULL));
cl_double2 clunit = {.s={1,0}};
CLBLAS_CH_ERR(clblasZaxpy(local_nRows,clunit,bufrvec,0,1,bufpvec,0,1,1,&command_queue,0,NULL,NULL));
#else
nIncrem10_cmplx(pvec,rvec,beta,NULL,NULL);
#endif
}
// q_k=Avecbuffer=A.p_k
if (niter==1 && matvec_ready) {} // do nothing, Avecbuffer is ready to use
else MatVec_wrapper(pvec,Avecbuffer,NULL,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
// mu_k=p_k.q_k; check for mu_k!=0
#ifdef OCL_BLAS
CLBLAS_CH_ERR(clblasZdotu(local_nRows,bufmu,0,bufpvec,0,1,bufAvecbuffer,0,1,buftmp,1,&command_queue,0,NULL,
NULL));
CL_CH_ERR(clEnqueueReadBuffer(command_queue,bufmu,CL_TRUE,0,sizeof(doublecomplex),&mu,0,NULL,NULL));
#else
mu=nDotProd_conj(pvec,Avecbuffer,&Timing_OneIterComm);
#endif
dtmp=cabs(mu)/abs_ro_new;
Dz("|pT.A.p|/(rT.r)="GFORM_DEBUG,dtmp);
if (dtmp<EPS2) LogError(ONE_POS,"BiCG_CS fails: |pT.A.p|/(rT.r) is too small ("GFORM_DEBUG").",dtmp);
// alpha_k=ro_k/mu_k
alpha=ro_new/mu;
// x_k=x_k-1+alpha_k*p_k
#ifdef OCL_BLAS
cl_double2 clalpha = {.s={creal(alpha),cimag(alpha)}};
CLBLAS_CH_ERR(clblasZaxpy(local_nRows,clalpha,bufpvec,0,1,bufxvec,0,1,1,&command_queue,0,NULL,NULL));
#else
nIncrem01_cmplx(xvec,pvec,alpha,NULL,NULL);
#endif
// r_k=r_k-1-alpha_k*A.p_k and |r_k|^2
temp=-alpha;
#ifdef OCL_BLAS
cl_double2 cltemp = {.s={creal(temp),cimag(temp)}};
CREATE_CL_BUFFER(bufinprodRp1,CL_MEM_READ_WRITE,2*sizeof(double),NULL); // 2 due to workaround below
CLBLAS_CH_ERR(clblasZaxpy(local_nRows,cltemp,bufAvecbuffer,0,1,bufrvec,0,1,1,&command_queue,0,NULL,NULL));
/* kernel for function clblasDznrm2 fails to compile (during ADDA execution) with modern OpenCL
* implementations, since the latter strictly impose conformance to the standard. Since, the compilation
* options for these kernels are not accessible, here we use a workaround through the complex dot-product
* function. This workaround requires twice larger memory for result, but twice smaller for buftmp, and
* returns the square of the norm.
* TODO: Since the clBlas is no more developed, this will stay here until we switch to some other library.
* The commented out parts will then facilitate reverting to calling a norm function.
*/
//CLBLAS_CH_ERR(clblasDznrm2(local_nRows,bufinprodRp1,0,bufrvec,0,1,buftmp,1,&command_queue,0,NULL,NULL));
CLBLAS_CH_ERR(clblasZdotc(local_nRows,bufinprodRp1,0,bufrvec,0,1,bufrvec,0,1,buftmp,1,&command_queue,0,NULL,NULL));
CL_CH_ERR(clEnqueueReadBuffer(command_queue,bufinprodRp1,CL_TRUE,0,sizeof(double),&inprodRp1,0,NULL,NULL));
//inprodRp1=inprodRp1*inprodRp1; // dot product returns already squared norm
#else
nIncrem01_cmplx(rvec,Avecbuffer,temp,&inprodRp1,&Timing_OneIterComm);
#endif
#ifdef OCL_BLAS
CL_CH_ERR(clFinish(command_queue)); // finish queue before freeing resources
my_clReleaseBuffer(bufinprodRp1);
my_clReleaseBuffer(bufro_new);
my_clReleaseBuffer(bufmu);
if (inprodRp1<epsB){
CL_CH_ERR(clEnqueueReadBuffer(command_queue,bufpvec,CL_FALSE,0,sizeof(doublecomplex)*local_nRows,pvec,0,
NULL,NULL));
CL_CH_ERR(clEnqueueReadBuffer(command_queue,bufrvec,CL_FALSE,0,sizeof(doublecomplex)*local_nRows,rvec,0,
NULL,NULL));
CL_CH_ERR(clEnqueueReadBuffer(command_queue,bufxvec,CL_TRUE,0,sizeof(doublecomplex)*local_nRows,xvec,0,
NULL,NULL));
}
#endif
// initialize ro_old -> ro_k-2 for next iteration
ro_old=ro_new;
return; // end of PHASE_ITER
}
LogError(ONE_POS,"Unknown phase (%d) of the iterative solver",(int)ph);
}
#undef EPS1
#undef EPS2
//======================================================================================================================
ITER_FUNC(BiCGStab)
/* Bi-Conjugate Gradient Stabilized, based on
* "Templates for the Solution of Linear Systems: Building Blocks for Iterative Methods",
* http://www.netlib.org/templates/Templates.html .
*/
{
#define EPS1 1E-10 // for 1/|beta|
#define EPS2 1E-10 // for |v.r~|/|r.r~|
static double denumOmega,dtmp;
static doublecomplex beta,ro_new,ro_old,omega,alpha,temp1,temp2;
static doublecomplex * restrict v,* restrict s,* restrict rtilda;
switch (ph) {
case PHASE_VARS:
/* rename some vectors; this doesn't contradict with 'restrict' keyword, since new names are not used
* together with old names
*/
v=vec1;
s=vec2;
rtilda=vec3;
// initialize data structure for checkpoints
scalars[0].ptr=&ro_old;
scalars[1].ptr=ω
scalars[2].ptr=α
scalars[0].size=scalars[1].size=scalars[2].size=sizeof(doublecomplex);
vectors[0].ptr=vec1; // v
vectors[1].ptr=vec2; // s
vectors[2].ptr=vec3; // rtilda
vectors[0].size=vectors[1].size=vectors[2].size=sizeof(doublecomplex);
return;
case PHASE_INIT:
if (!load_chpoint) nCopy(rtilda,rvec); // r~=r_0
return;
case PHASE_ITER:
// ro_k-1=r_k-1.r~ ; check for ro_k-1!=0
ro_new=nDotProd(rvec,rtilda,&Timing_OneIterComm);
if (niter==1) nCopy(pvec,rvec); // p_1=r_0
else {
// beta_k-1=(ro_k-1/ro_k-2)*(alpha_k-1/omega_k-1)
temp1=ro_new*alpha;
temp2=ro_old*omega;
// check that omega_k-1!=0; assume that ro_new is not exactly zero
dtmp=cabs(temp2)/cabs(temp1);
Dz("1/|beta|="GFORM_DEBUG,dtmp);
if (dtmp<EPS1) LogError(ONE_POS,"BiCGStab fails: 1/|beta| is too small ("GFORM_DEBUG").",dtmp);
beta=temp1/temp2;
// p_k=beta_k-1*(p_k-1-omega_k-1*v_k-1)+r_k-1
temp1=-beta*omega;
nIncrem110_cmplx(pvec,v,rvec,beta,temp1);
}
// calculate v_k=A.p_k
if (niter==1 && matvec_ready) nCopy(v,Avecbuffer);
else MatVec(pvec,v,NULL,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
// alpha_k=ro_new/(v_k.r~)
temp1=nDotProd(v,rtilda,&Timing_OneIterComm);
dtmp=cabs(temp1)/cabs(ro_new); // assume that ro_new is not exactly zero
Dz("|v.r~|/|r.r~|="GFORM_DEBUG,dtmp);
if (dtmp<EPS2) LogError(ONE_POS,"BiCGStab fails: |v.r~|/|r.r~| is too small ("GFORM_DEBUG").",dtmp);
alpha=ro_new/temp1;
// s=r_k-1-alpha*v_k-1
temp1=-alpha;
nLinComb1_cmplx(s,v,rvec,temp1,&inprodRp1,&Timing_OneIterComm);
// check convergence at this step; if yes, checkpoint should not be saved afterwards
if (inprodRp1<epsB && chp_type!=CHP_ALWAYS) {
// x_k=x_k-1+alpha_k*p_k
nIncrem01_cmplx(xvec,pvec,alpha,NULL,NULL);
complete=false;
}
else {
// t=Avecbuffer=A.s
MatVec(s,Avecbuffer,&denumOmega,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
// omega_k=s.t/|t|^2
omega=nDotProd(s,Avecbuffer,&Timing_OneIterComm)/denumOmega;
// x_k=x_k-1+alpha_k*p_k+omega_k*s
nIncrem011_cmplx(xvec,pvec,s,alpha,omega);
// r_k=s-omega_k*t and |r_k|^2
temp1=-omega;
nLinComb1_cmplx(rvec,Avecbuffer,s,temp1,&inprodRp1,&Timing_OneIterComm);
// initialize ro_old -> ro_k-2 for next iteration
ro_old=ro_new;
}
return; // end of PHASE_ITER
}
LogError(ONE_POS,"Unknown phase (%d) of the iterative solver",(int)ph);
}
#undef EPS1
#undef EPS2
//======================================================================================================================
ITER_FUNC(CGNR)
/* Conjugate Gradient applied to Normalized Equations with minimization of Residual Norm, based on
* "Templates for the Solution of Linear Systems: Building Blocks for Iterative Methods",
* http://www.netlib.org/templates/Templates.html .
*/
{
static double alpha, denumeratorAlpha;
static double beta,ro_new,ro_old;
switch (ph) {
case PHASE_VARS:
scalars[0].ptr=&ro_old;
scalars[0].size=sizeof(double);
return;
case PHASE_INIT: return; // no specific initialization required
case PHASE_ITER:
// p_1=Ah.r_0 and ro_new=ro_0=|Ah.r_0|^2
// since first product is with Ah , matvec_ready can't be employed
if (niter==1) MatVec(rvec,pvec,&ro_new,true,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
else {
// Avecbuffer=AH.r_k-1, ro_new=ro_k-1=|AH.r_k-1|^2
MatVec(rvec,Avecbuffer,&ro_new,true,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
// beta_k-1=ro_k-1/ro_k-2
beta=ro_new/ro_old;
// p_k=beta_k-1*p_k-1+AH.r_k-1
nIncrem10(pvec,Avecbuffer,beta,NULL,NULL);
}
// alpha_k=ro_k-1/|A.p_k|^2
// Avecbuffer=A.p_k
MatVec(pvec,Avecbuffer,&denumeratorAlpha,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
alpha=ro_new/denumeratorAlpha;
// x_k=x_k-1+alpha_k*p_k
nIncrem01(xvec,pvec,alpha,NULL,NULL);
// r_k=r_k-1-alpha_k*A.p_k and |r_k|^2
nIncrem01(rvec,Avecbuffer,-alpha,&inprodRp1,&Timing_OneIterComm);
// initialize ro_old -> ro_k-2 for next iteration
ro_old=ro_new;
return; // end of PHASE_ITER
}
LogError(ONE_POS,"Unknown phase (%d) of the iterative solver",(int)ph);
}
//======================================================================================================================
ITER_FUNC(CSYM)
/* Bi-Conjugate Gradient for Complex Symmetric systems, based on
* A. Bunse-Gerstner and R. Stover, "On a conjugate gradient-type method for solving complex symmetric linear systems,"
* Lin. Alg. Appl. 287, 105-123 (1999). with rearrangement of operations (MatVec is now calculated in the beginning of
* the iteration)
*
* Consumes one less vector than QMR-CS, because rvec does not need to be explicitly computed. The residual should
* always decrease and always be smaller than that of CGNR (for the same number of matrix-vector products).
*/
{
static doublecomplex alpha,gamma,invksi,theta,eta,tau,temp1,temp2,s_old,s_new;
static double dtmp,beta,c_old,c_new;
static doublecomplex *q_new,*q_old,*p_new,*p_old; // can't be declared restrict due to SwapPointers
switch (ph) {
case PHASE_VARS:
// rename some vectors
q_new=rvec; // q_k
q_old=vec1; // q_k-1
p_new=pvec; // p_k-1
p_old=vec2; // p_k-2
// initialize data structure for checkpoints
scalars[0].ptr=β
scalars[1].ptr=&c_old;
scalars[2].ptr=&c_new;
scalars[3].ptr=τ
scalars[4].ptr=&s_old;
scalars[5].ptr=&s_new;
scalars[0].size=scalars[1].size=scalars[2].size=sizeof(double);
scalars[3].size=scalars[4].size=scalars[5].size=sizeof(doublecomplex);
vectors[0].ptr=vec1; // now it is q_old, but can be changed further by swapping
vectors[1].ptr=vec2; // now it is p_old, but can be changed further by swapping
vectors[0].size=vectors[1].size=sizeof(doublecomplex);
return;
case PHASE_INIT:
if (load_chpoint) { // change pointers names according to count parity
/* change pointers names according to count parity. Based on the fact that first two are swapped at each
* iteration (and niter>=1), while second two - at each iteration starting from niter=2.
*/
if (IS_EVEN(niter)) SwapPointers(&q_old,&q_new);
else if (niter>1) SwapPointers(&p_old,&p_new);
}
else {
// tau_1 = ||r_0||; q_1 = r_0(*)/||r_0||; here r_0 is already stored in q_old
tau=sqrt(inprodR);
nMultSelf_conj(q_new,1/creal(tau));
// c_0=1; c_-1=0; s_0=s_-1=0
c_new=1;
c_old=0;
s_new=s_old=0;
}
return;
case PHASE_ITER:
/* Avecbuffer = A.q_k. Since q_1 is r_0(*), mat-vec product for niter==1 is equivalent to Ah.r_0 (as in
* CGNR). Thus, matvec_ready can't be employed.
*/
MatVec(q_new,Avecbuffer,NULL,false,&Timing_OneIterMVP,&Timing_OneIterMVPComm);
// alpha_k = q_k(T).A.q_k
alpha=nDotProd_conj(q_new,Avecbuffer,&Timing_OneIterComm);
// eta_k = c_k-2*c_k-1*beta_k + s_k-1(*)*alpha_k
eta = c_old*c_new*beta + alpha*conj(s_new);
// gamma_k = c_k-1*alpha_k - c_k-2*s_k-1*beta_k
gamma = c_new*alpha - c_old*s_new*beta;
// theta_k = s_k-2(*)*beta_k
theta=beta*conj(s_old);
// w = Aq_k - alpha_k*q_k(*) - beta_k*q_k-1(*); w is stored in q_old
temp1=-alpha; // temp1 = -alpha_k
// use explicitly that q_0=0
if (niter==1) nLinComb1_cmplx_conj(q_old,q_new,Avecbuffer,temp1,&dtmp,&Timing_OneIterComm);
else nIncrem110_d_c_conj(q_old,q_new,Avecbuffer,-beta,temp1,&dtmp,&Timing_OneIterComm);
// beta_k+1 = ||w|| (after that beta is beta_k+1)
// if beta=0 this is the last iteration, following formulae work fine in this case
beta=sqrt(dtmp);
// (k-1)-th values are moved to "old", while new (k-th) values are calculated next
c_old=c_new;
s_old=s_new;
dtmp=cabs(gamma);
if (dtmp==0) {
// the following condition should never occur
if (beta==0) LogError(ONE_POS,"Fatal error in CSYM iterative solver. Interaction matrix is singular");
c_new=0;
s_new=1;
invksi=1/beta;
}
else {
// c_k = |gamma_k| / sqrt(|gamma_k|^2 + beta_k+1^2), computed to avoid overflows
if (dtmp<beta) {
dtmp=dtmp/beta;
c_new=dtmp/sqrt(1+dtmp*dtmp);
}
else {
dtmp=beta/dtmp;
c_new=1/sqrt(1+dtmp*dtmp);
}
// 1/ksi_k = c_k/gamma_k
invksi=c_new/gamma;
// s_k = beta_k+1/ksi_k = beta_k+1*c_k/gamma_k
s_new=beta*invksi;
}
// p_k=(-theta_k*p_k-2-eta_k*p_k-1+q_k)/ksi_k
if (niter==1) nMult_cmplx(p_new,q_new,invksi); // use implicitly that p_0=p_-1=0
else {
temp1=-eta*invksi;
if (niter==2) nLinComb_cmplx(p_old,p_new,q_new,temp1,invksi,NULL,NULL); // use explicitly that p_0=0
else {
temp2=-theta*invksi;
nIncrem111_cmplx(p_old,p_new,q_new,temp2,temp1,invksi);
}
SwapPointers(&p_old,&p_new);
}
// x_k=x_k-1+tau_k*c_k*p_k
temp1=c_new*tau;
nIncrem01_cmplx(xvec,p_new,temp1,NULL,NULL);
// q_k+1 = w(*)/beta_k+1; it is first stored into q_old and then swapped
nMultSelf_conj(q_old,1/beta);
SwapPointers(&q_old,&q_new);
// tau_k+1 = -s_k*tau_k; ||r_k|| = |tau_k+1|
tau*=-s_new;
inprodRp1=cAbs2(tau);
#ifdef WORKAROUND146
dumb=tau;
#endif
return; // end of PHASE_ITER
}
LogError(ONE_POS,"Unknown phase (%d) of the iterative solver",(int)ph);
}
//======================================================================================================================
ITER_FUNC(QMR_CS)
/* Quasi Minimum Residual for Complex Symmetric systems, based on: