#include <cstdlib>
#include <cmath>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <set>
#include <vector>
#include <deque>
#include <map>
#include <algorithm>
#include <list>
#include <fstream>
#include "smesh.h"
#include "sfunction.h"
#include "random.h"
#include "number.h"
#include "mpi.h"
#include "common.h"
#include "intervals.h"
#include "matrixm.h"
#include "state.h"
#include "inout.h"
#include "operators.h"
#include "local.h"
#include "stateim.h"
#include "hb1.h"

using namespace std;

template <class Rand>
class CTQMC{
  Rand& rand;                     // random number generator

  ClusterData& cluster;           // information about the atomic states of the particular problem
  vector<nIntervals> intervals;   // Stores information to compute the bath determinant and green's function: times and types corresponding to each bath sorted in the proper time order
  NOperators Operators;           // Stores information to compute local trace: all times and types in the proper time order.
  
  function1D<NState> npraStates;  // list of all atomic states
  function2D<NState> state_evolution_left, state_evolution_right; // evolution of atomic states for fast update of atomic trace
  vector<function2D<double> > MD; // matrix of the inverse of delta which is needed to computer local Green's function
  vector<MatrixM> TMD;            // class which manipulates matrix of deltas and local green's function
  vector<spline1D<double> > Delta;// Delta in imaginary time
  vector<function2D<spline1D<double>* > > Deltat; // Delta in imaginary time but in matrix form (needed for non-diagonal cases)
  
  Number matrix_element;          // current matrix element due to local part of the system
  function1D<Number> Trace;       // current atomic trace (for computing average)
  vector<double> detD;            // bath determinants
  
  mesh1D tau;                     // imaginary time mesh
  mesh1D iom;                     // small imaginary frequency mesh on which local Green's function is manipulated
  mesh1D biom;                    // small bosonic imaginary frequency mesh for susceptibilities
  
  function1D<double> histogram;   // histogram
  
  int Naver;                      // Number of measurements
  vector<function2D<dcomplex> > Gaver; // Current approximation for the local Green's function
  function1D<double> observables; // some zero time quantities needed for calculation of occupancies and susceptibilities
  function1D<double> aver_observables; // the same zero-time quantities but the average in simulation rather than the current state
  function2D<double> Prob;        // Probability for each atomic state in current configuration
  function2D<double> AProb;       // Average Probability
  function1D<double> kaver;       // average perturbation order for each bath - measures kinetic energy
  int nsusc;
  function2D<double> susc;        // current frequency dependent susceptibility
  function2D<double> aver_susc;   // sampled frequency dependent susceptibility
  function2D<dcomplex> dsusc;       // temporary variable for frequency dependent susceptibility
  function2D<dcomplex> dsusc1;       // temporary variable for frequency dependent susceptibility
  
  vector<function2D<double> > Gtau; // Green's function in imaginary time (not so precise)
  int NGta;                       // number of measurements of Gtau
  function2D<dcomplex> dexp, dexp1;      // differences e^{iom*tau_{l+1}}-e^{iom*tau_l} needed for frequency dependent response functions
  int successful;                 // number of successful moves

  function2D<dcomplex> vertex;
  
  function1D<double> dtau;        // temporary for computeaverage
  function2D<double> tMD;         // just temporary used in global flip
  nIntervals tinterval;           // just temporary used in global flip
  function2D<dcomplex> bexp;      // temporary for e^{iom*tau}
  
public:
  
  CTQMC(Rand& rand_, ClusterData& cluster_, int Nmax, const mesh1D& iom_large, const function2D<dcomplex>& Deltaw, const vector<double>& ah, int nom, int nomb, int Ntau, bool IamMaster);
  ~CTQMC();
  
  double sample(int max_steps); // main function which does the Monte Carlo sampling
 
  void RecomputeCix(); // tried to find better atomic base for the next iteration
  void ComputeFinalAverage(function1D<double>& mom); // From atomic probabilities computes various zero time quantities
  
  bool SaveStatus(int my_rank);     // saves all kinks to disc for shorter warmup in the next step
  bool RetrieveStatus(int my_rank); // retrieves all kinks from disc
  
  const mesh1D& ioms() const { return iom;}
  const mesh1D& iomb() const { return biom;}
  function1D<double>& Histogram() {return histogram;}
  function2D<double>& AverageProbability() {return AProb;}
  function1D<double>& Observables() {return aver_observables;}
  function1D<double>& k_aver() {return kaver;}
  vector<function2D<dcomplex> >& G_aver() { return Gaver;}
  const vector<function2D<double> >& gtau() const { return Gtau;}
  function2D<double>& Susc() {return aver_susc;}
private:
  void Add_One_Kink(int i, int ifl);
  void Remove_One_Kink(int i, int ifl);
  void Move_A_Kink(int i, int ifl);
  void Add_Two_Kinks(int i, int ifl);
  void Remove_Two_Kinks(int i, int ifl);
  void ComputeAverage();
  void CleanUpdate(int i);
  void StoreCurrentState(function1D<double>& aver);
  void PrintInfo(int i, const function1D<double>& aver);
  void GlobalFlip(int i);
};

template <class Rand>
CTQMC<Rand>::CTQMC(Rand& rand_, ClusterData& cluster_, int Nmax, const mesh1D& iom_large, const function2D<dcomplex>& Deltaw, const vector<double>& ah, int nom, int nomb, int Ntau, bool IamMaster) :
  rand(rand_), cluster(cluster_), intervals(common::N_ifl), Operators(Nmax, cluster),
  npraStates(cluster.nsize), state_evolution_left(cluster.nsize,Nmax), state_evolution_right(cluster.nsize,Nmax),
  MD(common::N_ifl), TMD(common::N_ifl), Delta(cluster.N_unique_fl), Deltat(cluster.N_ifl), Trace(cluster.nsize), detD(common::N_ifl),
  iom(nom), biom(nomb), histogram(Nmax),
  Naver(0), Gaver(cluster.N_ifl), observables(4), aver_observables(4),
  Prob(cluster.nsize,common::max_size), AProb(cluster.nsize,common::max_size),
  kaver(common::N_ifl), nsusc(2), susc(nsusc,nomb), aver_susc(nsusc,nomb), dsusc(nomb,nsusc), dsusc1(nsusc*cluster.nsize, nomb),
  Gtau(common::N_ifl), NGta(0), dexp(Nmax+1,nom), dexp1(Nmax+1,nom), successful(0), 
  vertex(nsusc*cluster.nsize, Nmax+1), dtau(Nmax+1), bexp(Nmax,max(nom,nomb))
{
  // all atomic states are generated
  for (int j=0; j<npraStates.size(); j++) npraStates[j].SetPraState(j,cluster);
  
  DeltaFourier(Ntau, common::beta, tau, Delta, iom_large, Deltaw, ah);
  
  int ibath=0;
  int ntau=tau.size();
  for (int ifl=0; ifl<cluster.N_ifl; ifl++){
    Deltat[ifl].resize(cluster.ifl_dim[ifl],cluster.ifl_dim[ifl]);
    for (int i1=0; i1<cluster.ifl_dim[ifl]; i1++){
      for (int i2=0; i2<cluster.ifl_dim[ifl]; i2++){
	Deltat[ifl][i1][i2] = new spline1D<double>(Ntau);
	int ib = cluster.tfl_index[ifl][i1][i2];
	int fl = cluster.bfl_index[ifl][ib];
	for (int it=0; it<tau.size(); it++){
	  double Dt = Delta[fl][it];
	  // This is new change! Checks for diagonal Deltas to be causal! changed 14.2.2007
	  if (i1==i2 && Dt>-common::minDeltat) Dt = -common::minDeltat; // might not work if Delta(tau)>0
			    
	  if (cluster.conjg[ifl][ib]) Dt = -Delta[fl][ntau-it-1];
	  Dt *= cluster.sign[ifl][ib];
	  (*Deltat[ifl][i1][i2])[it] = Dt;
	}
	double df0 = ((*Deltat[ifl][i1][i2])[1]-(*Deltat[ifl][i1][i2])[0])/(tau[1]-tau[0]);
	int n = tau.size();
	double dfn = ((*Deltat[ifl][i1][i2])[n-1]-(*Deltat[ifl][i1][i2])[n-2])/(tau[n-1]-tau[n-2]);
	(*Deltat[ifl][i1][i2]).splineIt(tau,df0,dfn);
	ibath++;
      }
    }
  }

  if (IamMaster){
    for (int ifl=0; ifl<cluster.N_ifl; ifl++){
      for (int i1=0; i1<cluster.ifl_dim[ifl]; i1++){
	for (int i2=0; i2<cluster.ifl_dim[ifl]; i2++){
	  ofstream out(NameOfFile_("Delta.tau",ifl,i1*2+i2).c_str());
	  for (int it=0; it<1000; it++){
	    double t = it*common::beta/(1000.-1);
	    double Dt = (*Deltat[ifl][i1][i2])(tau.Interp(t));
	    out<<t<<" "<<Dt<<endl;
	  }
	}
      }
    }
  }
  for (int i=0; i<iom.size(); i++) iom[i] = iom_large[i];
  
  for (int i=0; i<nomb; i++) biom[i] = 2*i*M_PI/common::beta;
    
  // propagators are set-up
  for (size_t ifl=0; ifl<intervals.size(); ifl++) intervals[ifl].SetUp(Nmax,iom,biom,common::beta,cluster.ifl_dim[ifl]);
  tinterval.SetUp(Nmax,iom,biom,common::beta,0);
  // matrix M is set up
  for (size_t ifl=0; ifl<MD.size(); ifl++) MD[ifl].resize(Nmax,Nmax);
  for (size_t ifl=0; ifl<TMD.size(); ifl++) TMD[ifl].SetUp(Nmax, tau, Deltat[ifl], cluster.tfl_index[ifl], iom, common::beta);

  // set-up local part of the system
  StartStateEvolution(state_evolution_left, state_evolution_right, npraStates, Operators, cluster, Trace, Prob);
  // approximation for Glocal
  for (int ifl=0; ifl<cluster.N_ifl; ifl++){
    Gaver[ifl].resize(cluster.N_baths(ifl),iom.size());
    Gaver[ifl] = 0;
  }

  for (size_t ifl=0; ifl<detD.size(); ifl++) detD[ifl] = 1;
  
  for (int j=0; j<cluster.nsize; j++){
    Prob[j].resize(cluster.msize(j+1));
    AProb[j].resize(cluster.msize(j+1));
  }

  if (common::SampleGtau>0)
    for (size_t ifl=0; ifl<Gtau.size(); ifl++){
      Gtau[ifl].resize(sqr(cluster.ifl_dim[ifl]),Ntau);
      Gtau[ifl]=0;
    }
  
  kaver = 0;
  aver_susc = 0;
}

template <class Rand>
CTQMC<Rand>::~CTQMC()
{
  for (size_t ifl=0; ifl<Deltat.size(); ifl++)
    for (int i1=0; i1<Deltat[ifl].size_N(); i1++)
      for (int i2=0; i2<Deltat[ifl].size_Nd(); i2++)
	delete Deltat[ifl][i1][i2];
}
    

template <class Rand>
void CTQMC<Rand>::Add_One_Kink(int i, int ifl)
{
  if (Operators.full()) return;
  int dim = cluster.ifl_dim[ifl];
  int kn = intervals[ifl].size()/2;
  int bfls = static_cast<int>(dim*rand());
  int bfle = static_cast<int>(dim*rand());
  int fls = cluster.fl_from_ifl[ifl][bfls];
  int fle = cluster.fl_from_ifl[ifl][bfle];
	  
  double t_start = common::beta*rand();
  double t_end = common::beta*rand();
	  
  bool ssc1 = Operators.Try_Add_Cd_C_(fls, t_start, fle, t_end, state_evolution_left, state_evolution_right);
  
  if (t_start!=t_end && ssc1){
    double ratioD = TMD[ifl].AddDetRatio(MD[ifl], kn, t_start, bfls, t_end, bfle, intervals[ifl]);
	  
    pair<int,int> opt = Operators.Try_Add_Cd_C(fls, t_start, fle, t_end);
    int op_i = min(opt.first, opt.second), op_f = max(opt.first, opt.second);
    Number ms_new = ComputeTrace(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, 2);
    double ms = fabs(divide(ms_new, matrix_element));
    double P = sqr(common::beta*dim)/sqr(kn+1.)*(fabs(ratioD)*ms);
    //    cout<<"i "<<setw(5)<<i<<" "<<setw(18)<<P<<" "<<setw(18)<<ms<<" "<<setw(18)<<ratioD<<" "<<setw(4)<<Operators.size()<<endl;
    if (ms>common::minD && 1-rand()<P){
      int is, ie;
      intervals[ifl].Find_is_ie(t_start, is, t_end, ie);
      ratioD = TMD[ifl].AddUpdateMatrix(MD[ifl], kn, t_start, is, bfls, t_end, ie, bfle, intervals[ifl]);
      pair<int,int> ipl = intervals[ifl].InsertExponents(t_start, is, bfls, t_end, ie, bfle);
      TMD[ifl].AddUpdate_Gf(intervals[ifl]);
      
      pair<int,int> opt = Operators.Add_Cd_C(fls, t_start, fle, t_end,   IntervalIndex(ifl, nIntervals::cd, ipl.first), IntervalIndex(ifl, nIntervals::c, ipl.second));
      // And updates state evolution for fast computation of Trace
      Number new_matrix_element = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, Trace, Prob);
      
	    
      detD[ifl] *= ratioD;
      matrix_element = new_matrix_element;
      successful++;
      
      ComputeAverage();
    }
  }
}


template <class Rand>
void CTQMC<Rand>::Remove_One_Kink(int i, int ifl)
{
  int kn = intervals[ifl].size()/2;
  if (kn==0) return;
  int dim = cluster.ifl_dim[ifl];
  int ie = static_cast<int>(kn*rand());
  int is = static_cast<int>(kn*rand());
  int bfle = intervals[ifl].btype_e(ie);
  int bfls = intervals[ifl].btype_s(is);
  int iop_s = intervals[ifl].FindSuccessive(0, is, bfls);
  int iop_e = intervals[ifl].FindSuccessive(1, ie, bfle);
  int fle = cluster.fl_from_ifl[ifl][bfle];
  int fls = cluster.fl_from_ifl[ifl][bfls];
  int ipe, ips;
  bool ssc1 = Operators.Try_Remove_C_Cd_(fle, iop_e, fls, iop_s, ipe, ips, state_evolution_left, state_evolution_right);
    
  if (ssc1){
    double ratioD = TMD[ifl].RemoveDetRatio(MD[ifl], kn, is, ie);
    int op_i = min(ipe, ips);
    int op_f = ipe>ips ? ipe-2 : ips-1;
    Operators.Try_Remove_C_Cd(ipe, ips);
    Number ms_new = ComputeTrace(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, -2);
    double ms = fabs(divide(ms_new,matrix_element));
    double P = (fabs(ratioD)*ms)*sqr(kn)/sqr(common::beta*dim);
    //    cout<<"r "<<setw(5)<<i<<" "<<setw(18)<<P<<" "<<setw(18)<<ms<<" "<<setw(18)<<ratioD<<" "<<setw(4)<<Operators.size()<<endl;
    if (ms>common::minD && 1-rand()<P){
      
      Operators.Remove_C_Cd(ipe, ips);
      // And updates state evolution for fast computation of Trace
      Number new_matrix_element = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, Trace, Prob);
      
      TMD[ifl].RemoveUpdate_Gf(MD[ifl], kn, is, ie, intervals[ifl]);
	
      ratioD = TMD[ifl].RemoveUpdateMatrix(MD[ifl], kn, is, ie, intervals[ifl]);
      intervals[ifl].RemoveExponents(is,ie);
      detD[ifl] *= ratioD;
      matrix_element = new_matrix_element;
      successful++;
	
      ComputeAverage();
    }
  }
}

template <class Rand>
void CTQMC<Rand>::Move_A_Kink(int i, int ifl)
{
  int kn = intervals[ifl].size()/2;
  if (kn==0) return;
  //  int dim = cluster.ifl_dim[ifl];
  int type = static_cast<int>(rand()*2);
  int to_move = static_cast<int>(kn*rand());
  int bfl = intervals[ifl].btype(type, to_move);
  int fl = cluster.fl_from_ifl[ifl][bfl];
  int opera = 2*fl+type;
  int to_move_o = intervals[ifl].FindSuccessive(type, to_move, bfl);
	  
  double t_old = intervals[ifl].time(type,to_move);
  double t_prev = intervals[ifl].PreviousTime(type,to_move);
  double t_next = intervals[ifl].NextTime(type, to_move);
	
  double t_new = t_prev + (t_next-t_prev)*rand();
  if (t_new<0) t_new += common::beta;
  if (t_new>common::beta) t_new -= common::beta;
	
  bool ssc1 = Operators.Try_Move_(opera, t_old, to_move_o, t_new, state_evolution_left, state_evolution_right);
  if (ssc1){
    double ratioD;
    if (type==0) ratioD = TMD[ifl].Move_start_DetRatio(MD[ifl], kn, t_old, t_new, bfl, to_move, intervals[ifl]);
    else ratioD = TMD[ifl].Move_end_DetRatio(MD[ifl], kn, t_old, t_new, bfl, to_move, intervals[ifl]);
    int ip_old, ipo, ip_new;
    Operators.TryMove(opera, t_old, to_move_o, ip_old, ipo, t_new, ip_new);
    int op_i = min(ipo, ip_new), op_f = max(ipo, ip_new);
    Number ms_new = ComputeTrace(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, 0);
    double ms = fabs(divide(ms_new, matrix_element));
    double P = fabs(ratioD*ms);
    //	  cout<<"m "<<setw(5)<<i<<" "<<setw(10)<<P<<" "<<setw(10)<<ms<<" "<<setw(10)<<ratioD<<endl;
    if (ms>common::minD && 1-rand()<P){
      Operators.Move(ip_old, ip_new);
      Number new_matrix_element = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, Trace, Prob);
      int i_new = intervals[ifl].FindIndex(t_new, t_old, type);
      if (type==0) TMD[ifl].Move_start_UpdateMatrix(MD[ifl], kn, t_old, t_new, bfl, to_move, i_new, intervals[ifl]);
      else TMD[ifl].Move_end_UpdateMatrix(MD[ifl], kn, t_old, t_new, bfl, to_move, i_new, intervals[ifl]);
      intervals[ifl].MoveExponents(type, t_old, to_move, t_new, i_new);
      TMD[ifl].MoveUpdate_Gf(intervals[ifl]);
      detD[ifl] *= ratioD;
      matrix_element = new_matrix_element;
      successful++;
      
      ComputeAverage();
    }
  }
}

template <class Rand>
void CTQMC<Rand>::Add_Two_Kinks(int i, int ifl)
{
  if (Operators.full()) return;
  int dim = cluster.ifl_dim[ifl];
  int kn = intervals[ifl].size()/2;
  int bfls1 = static_cast<int>(dim*rand());
  int bfle1 = static_cast<int>(dim*rand());
  int bfls2 = static_cast<int>(dim*rand());
  int bfle2 = static_cast<int>(dim*rand());
  int fls1 = cluster.fl_from_ifl[ifl][bfls1];
  int fle1 = cluster.fl_from_ifl[ifl][bfle1];
  int fls2 = cluster.fl_from_ifl[ifl][bfls2];
  int fle2 = cluster.fl_from_ifl[ifl][bfle2];

  double t_start1 = common::beta*rand();
  double t_end1 = common::beta*rand();
  double t_start2 = common::beta*rand();
  double t_end2 = common::beta*rand();
	  
  bool ssc1 = Operators.Try_Add_2_Cd_2_C_(fls1, t_start1, fle1, t_end1, fls2, t_start2, fle2, t_end2, state_evolution_left, state_evolution_right);

  if (t_start1==t_end1 || t_start2==t_end2 || t_start1==t_start2 || t_end1==t_end2) return;
  
  if (ssc1){
    double ratioD = TMD[ifl].AddDetRatio(MD[ifl], kn, bfls1, t_start1, bfle1, t_end1, bfls2, t_start2, bfle2, t_end2, intervals[ifl]);

    pair<int,int> opt = Operators.Try_Add_2_Cd_2_C(fls1, t_start1, fle1, t_end1, fls2, t_start2, fle2, t_end2);
    int op_i = min(opt.first, opt.second), op_f = max(opt.first, opt.second);
    Number ms_new = ComputeTrace(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, 4);
    double ms = fabs(divide(ms_new, matrix_element));
    double P = sqr(common::beta*dim/(kn+1.))*sqr(common::beta*dim/(kn+2.))*(fabs(ratioD)*ms);
    //    cout<<"2 "<<setw(5)<<i<<" "<<setw(10)<<P<<" "<<setw(10)<<ms<<" "<<setw(10)<<ratioD<<" "<<kn<<endl;
    if (1-rand()<P && ms>common::minD){
      
      int is1, ie1;
      intervals[ifl].Find_is_ie(t_start1, is1, t_end1, ie1);
      double ratioD1 = TMD[ifl].AddUpdateMatrix(MD[ifl], kn, t_start1, is1, bfls1, t_end1, ie1, bfle1, intervals[ifl]);
      pair<int,int> ipl1 = intervals[ifl].InsertExponents(t_start1, is1, bfls1, t_end1, ie1, bfle1);
      TMD[ifl].AddUpdate_Gf(intervals[ifl]);
      
      pair<int,int> opt1 = Operators.Add_Cd_C(fls1, t_start1, fle1, t_end1, IntervalIndex(ifl, nIntervals::cd, ipl1.first), IntervalIndex(ifl, nIntervals::c, ipl1.second));
      
      int op_i1 = min(opt1.first, opt1.second);
      int op_f1 = max(opt1.first, opt1.second);
      Number new_matrix_element1 = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i1, op_f1, Operators, npraStates, cluster, Trace, Prob);
      

      
      int is2, ie2;
      intervals[ifl].Find_is_ie(t_start2, is2, t_end2, ie2);
      double ratioD2 = TMD[ifl].AddUpdateMatrix(MD[ifl], kn+1, t_start2, is2, bfls2, t_end2, ie2, bfle2, intervals[ifl]);
      pair<int,int> ipl2 = intervals[ifl].InsertExponents(t_start2, is2, bfls2, t_end2, ie2, bfle2);
      TMD[ifl].AddUpdate_Gf(intervals[ifl]);
      
      pair<int,int> opt2 = Operators.Add_Cd_C(fls2, t_start2, fle2, t_end2, IntervalIndex(ifl, nIntervals::cd, ipl2.first), IntervalIndex(ifl, nIntervals::c, ipl2.second));
	
      int op_i2 = min(opt2.first, opt2.second);
      int op_f2 = max(opt2.first, opt2.second);
      Number new_matrix_element2 = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i2, op_f2, Operators, npraStates, cluster, Trace, Prob);
      

      double ms_new = divide(new_matrix_element2,matrix_element);
      if (fabs(fabs(ms)-fabs(ms_new))>1e-6) cerr<<"Matrix element not the same a2!"<<endl;
      double ratioD_new = ratioD1*ratioD2;
      if (fabs(fabs(ratioD_new)-fabs(ratioD))>1e-6) cerr<<"ratioD not the same!"<<endl;
	      
      detD[ifl] *= ratioD1*ratioD2;
      matrix_element = new_matrix_element2;
      successful++;
	    
      ComputeAverage();
    }
  }
}

template <class Rand>
void CTQMC<Rand>::Remove_Two_Kinks(int i, int ifl)
{
  int kn = intervals[ifl].size()/2;
  if (kn<=1) return;
  int dim = cluster.ifl_dim[ifl];
  int ie1 = static_cast<int>(kn*rand());
  int is1 = static_cast<int>(kn*rand());
  int bfle1 = intervals[ifl].btype_e(ie1);
  int bfls1 = intervals[ifl].btype_s(is1);
  int ie2, bfle2;
  do{
    ie2 = static_cast<int>(kn*rand());
    bfle2 = intervals[ifl].btype_e(ie2);
  } while(ie2==ie1);
  int is2, bfls2;
  do{
    is2 = static_cast<int>(kn*rand());
    bfls2 = intervals[ifl].btype_s(is2);
  } while (is2==is1);
  
  int fle1 = cluster.fl_from_ifl[ifl][bfle1];
  int fls1 = cluster.fl_from_ifl[ifl][bfls1];
  int fle2 = cluster.fl_from_ifl[ifl][bfle2];
  int fls2 = cluster.fl_from_ifl[ifl][bfls2];
	  
  int iop_s1 = intervals[ifl].FindSuccessive(0, is1, bfls1);
  int iop_e1 = intervals[ifl].FindSuccessive(1, ie1, bfle1);
  int iop_s2 = intervals[ifl].FindSuccessive(0, is2, bfls2);
  int iop_e2 = intervals[ifl].FindSuccessive(1, ie2, bfle2);
  
  int ipe1, ips1, ipe2, ips2;
  bool ssc1 = Operators.Try_Remove_2_C_2_Cd_(fle1, iop_e1, fls1, iop_s1, fle2, iop_e2, fls2, iop_s2, ipe1, ips1, ipe2, ips2, state_evolution_left, state_evolution_right);
    
  if (ssc1){
    double ratioD = TMD[ifl].RemoveDetRatio(MD[ifl], kn, is1, ie1, is2, ie2);
    int op_i = Operators.mini();
    int op_f = Operators.maxi()-2;
    Operators.Try_Remove_C_Cd(ipe1, ips1, ipe2, ips2);
    Number ms_new = ComputeTrace(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, -4);
    double ms = fabs(divide(ms_new,matrix_element));
    double P = (fabs(ratioD)*ms)*sqr(kn/(common::beta*dim))*sqr((kn-1)/(common::beta*dim));
    //	    cout<<"r "<<setw(5)<<i<<" "<<setw(10)<<P<<" "<<setw(10)<<ms<<" "<<setw(10)<<ratioD<<endl;
    if (1-rand()<P && ms>common::minD){
      //      cout<<"r2 "<<setw(5)<<i<<" "<<setw(10)<<P<<" "<<setw(10)<<ms<<" "<<setw(10)<<ratioD<<" "<<kn<<endl;
      Operators.Remove_C_Cd(ipe1, ips1);
      int op_i = min(ipe1, ips1);
      int op_f = ipe1>ips1 ? ipe1-2 : ips1-1;
      Number new_matrix_element1 = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, Trace, Prob);
      TMD[ifl].RemoveUpdate_Gf(MD[ifl], kn, is1, ie1, intervals[ifl]);
      double ratioD1 = TMD[ifl].RemoveUpdateMatrix(MD[ifl], kn, is1, ie1, intervals[ifl]);
      intervals[ifl].RemoveExponents(is1,ie1);

      Operators.Remove_C_Cd(ipe2, ips2);
      op_i = min(ipe2, ips2);
      op_f = ipe2>ips2 ? ipe2-2 : ips2-1;
      Number new_matrix_element2 = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, Trace, Prob);
      if (is1<is2) is2--;
      if (ie1<ie2) ie2--;
      TMD[ifl].RemoveUpdate_Gf(MD[ifl], kn-1, is2, ie2, intervals[ifl]);
      double ratioD2 = TMD[ifl].RemoveUpdateMatrix(MD[ifl], kn-1, is2, ie2, intervals[ifl]);
      intervals[ifl].RemoveExponents(is2,ie2);
		
      double ms_new = divide(new_matrix_element2,matrix_element);
      if (fabs(fabs(ms)-fabs(ms_new))>1e-6) cerr<<"Matrix element not the same r2!"<<endl;
      double ratioD_new = ratioD1*ratioD2;
      if (fabs(fabs(ratioD_new)-fabs(ratioD))>1e-6) cerr<<"ratioD not the same!"<<endl;
		
      detD[ifl] *= ratioD1*ratioD2;
      matrix_element = new_matrix_element2;
      successful++;
	    
      ComputeAverage();
    }
  }
}

template <class Rand>
void CTQMC<Rand>::ComputeFinalAverage(function1D<double>& mom0)
{
  cout<<"mom="<<left;
  mom0.resize(cluster.N_unique_fl*cluster.HF_M.size());
  for (int op=0; op<cluster.HF_M.size(); op++){
    for (int fl=0; fl<cluster.N_unique_fl; fl++){
      double sum=0;
      for (int ist=0; ist<npraStates.size(); ist++)
	for (int m=0; m<cluster.msize(ist+1); m++)
	  sum += cluster.HF_M[op][fl][ist+1][m][m]*AProb[ist][m];
      int ind = fl+op*cluster.N_unique_fl;
      mom0[ind] = sum;
      cout<<setw(8)<<mom0[ind]<<" ";
    }
  }
  cout<<endl;
}

template <int boson_ferm>
inline const funProxy<dcomplex>& find_Op_in_intervals(int ip, const vector<nIntervals>& intervals, const NOperators& Operators)
{
  IntervalIndex p_ifl = Operators.iifl(ip);
  int ifl = p_ifl.ifl;
  int itp = p_ifl.type;
  int in = p_ifl.in;

  return intervals[ifl]. template exp_direct<boson_ferm>(itp,in);
  // The following line contains "template" because pre 3.0 gcc had a bug and could not recognize template function in this form.
  // If your compiler supports the line below, please uncomment the line below and comment the line above
  //  return intervals[ifl].exp_direct<boson_ferm>(itp,in); 
}

template <int boson_ferm>// e^{iom*beta}
double e_om_beta(){ return -100;};
template <>
double e_om_beta<0>(){return 1;}// bosonic
template <>
double e_om_beta<1>(){return -1;}// fermionic

template <int boson_ferm>
void ComputeFrequencyPart(const vector<nIntervals>& intervals, const NOperators& Operators, int omsize, function2D<dcomplex>& dexp)
{ // This function computes the following factor:
  //            exp(iom*tau_{l+1})-exp(iom(tau_l)
  //            for frequencies iom sampled and all kink times tau_l.
  int N = Operators.size();
  dexp.resize(N+1, omsize);
  double eom_beta = e_om_beta<boson_ferm>();
  
  if (N==0){
    for (int im=0; im<omsize; im++)  dexp(0,im) = eom_beta-1;// e^{iom*beta}-e^{iom*0}
    return;
  }
  
  {// first kink
    const funProxy<dcomplex>& exp1 = find_Op_in_intervals<boson_ferm>(0, intervals, Operators);
    funProxy<dcomplex>& dexp_ = dexp[0];
    for (int im=0; im<omsize; im++)  dexp_[im] = exp1[im]-1; // exp(iom*0)==1
  }
  for (int ip=1; ip<N; ip++){// most of the kinks
    const funProxy<dcomplex>& exp0 = find_Op_in_intervals<boson_ferm>(ip-1, intervals, Operators);
    const funProxy<dcomplex>& exp1 = find_Op_in_intervals<boson_ferm>(ip,   intervals, Operators);
    funProxy<dcomplex>& dexp_ = dexp[ip];
    for (int im=0; im<omsize; im++)  dexp_[im] = exp1[im]-exp0[im];
  }
  {// after last kink
    const funProxy<dcomplex>& exp0 = find_Op_in_intervals<boson_ferm>(N-1, intervals, Operators);
    funProxy<dcomplex>& dexp_ = dexp[N];
    for (int im=0; im<omsize; im++)  dexp_[im] = eom_beta-exp0[im]; // exp(iom*beta)==+-1
  }
}


void ComputeFrequencyPart_Slow(const NOperators& Operators, const mesh1D& iom, function2D<dcomplex>& bexp, function2D<dcomplex>& dexp)
{ // This function computes the following factor:
  //            exp(iom*tau_{l+1})-exp(iom(tau_l)
  //            for frequencies iom sampled and all kink times tau_l.
  int N = Operators.size();
  int nom = iom.size();
  dexp.resize(N+1, iom.size());

  double eom_beta = cos(iom[0]*common::beta);
    
  if (N==0){
    dexp=0; // e^{iom*beta}-e^{iom*0} ==0
    return;
  }
  // This could be done at every insertion or removal of kink
  // But here it is done for all kinks every time
  for (int ip=0; ip<N; ip++){
    double tau = Operators.t(ip);
    for (int im=0; im<nom; im++){
      double phase = iom[im]*tau;
      bexp(ip,im).Set(cos(phase),sin(phase));
    }
  }

  const funProxy<dcomplex>& exp1 = bexp[0];
  funProxy<dcomplex>& dexp_ = dexp[0];
  for (int im=0; im<nom; im++)  dexp_[im] = exp1[im]-1; // exp(iom*0)==1
  
  for (int ip=1; ip<N; ip++){
    const funProxy<dcomplex>& exp0 = bexp[ip-1];
    const funProxy<dcomplex>& exp1 = bexp[ip];
    funProxy<dcomplex>& dexp_ = dexp[ip];
    for (int im=0; im<nom; im++)  dexp_[im] = exp1[im]-exp0[im];
  }

  {
    const funProxy<dcomplex>& exp0 = bexp[N-1];
    funProxy<dcomplex>& dexp_ = dexp[N];
    for (int im=0; im<nom; im++)  dexp_[im] = eom_beta-exp0[im]; // exp(iom*beta)==1
  }
}
template <class Rand>
void CTQMC<Rand>::ComputeAverage()
{// components of observables:
  // 0 : nf
  // 1 : chi_{zz} = <Sz(t) Sz>
  // 2 : chi(density-density) = <1/2 n(t) 1/2 n>
  // 3 : <Sz>
  if (observables.size()!=4) cerr<<"Resize observables!"<<endl;
  observables=0;

  if (common::SampleSusc){
    susc=0;
    
    //    ComputeFrequencyPart_Slow(Operators, biom, bexp, dexp);
    ComputeFrequencyPart<0>(intervals, Operators, biom.size(), dexp);

  }

  // time differences can be precomputed
  int N = Operators.size();
  dtau.resize(N+1);
  if (N>0){
    dtau[0] = Operators.t(0);
    for (int ip=1; ip<N; ip++) dtau[ip] = Operators.t(ip)-Operators.t(ip-1);
    dtau[N] = common::beta-Operators.t(N-1);
  } else dtau[0] = common::beta;

  deque<int> contrib;
  for (int ist=0; ist<npraStates.size(); ist++){
    // Checking which states do not contribute to trace
    if (state_evolution_left[ist].size()<=Operators.size() || state_evolution_left[ist].last().istate!=npraStates[ist].istate) continue;
    if (npraStates[ist].empty()) continue;
    double P_i = fabs(divide(Trace[ist],matrix_element)); // probability for this state
    if (P_i<1e-10) continue; // Probability negligible

    contrib.push_back(ist);
    int istate = npraStates[ist].istate;    
    double Ns = cluster.Ns[istate];
    double Sz = cluster.Sz[istate];
    double Ns2 = Ns/2.;
    
    double dnf = Ns*dtau[0];
    double dmg = Sz*dtau[0];
 
    if (common::SampleSusc){
      for (int im=1; im<dexp.size_Nd(); im++){
	dsusc(im,0) = dexp[0][im] * Sz;
	dsusc(im,1) = dexp[0][im] * Ns2;
      }
    }
 
    for (int ip=0; ip<Operators.size(); ip++){
      int jstate = state_evolution_left[ist][ip].istate;
      double Ns = cluster.Ns[jstate];
      double Sz = cluster.Sz[jstate];
      double Ns2 = Ns/2.;
      
      dnf += Ns*dtau[ip+1];
      dmg += Sz*dtau[ip+1];

      if (common::SampleSusc){
	const funProxy<dcomplex>& dexp_ = dexp[ip+1];
	for (int im=1; im<dexp.size_Nd(); im++){
	  dsusc(im,0) += dexp_[im] * Sz;
	  dsusc(im,1) += dexp_[im] * Ns2;
	}
      }
    }

    double fact = P_i/common::beta;
    observables[0] += dnf*fact;
    observables[1] += sqr(dmg)*fact;
    observables[2] += sqr(0.5*dnf)*fact;
    observables[3] += dmg*fact;
   
    if (common::SampleSusc){
      for (int im=1; im<susc.size_Nd(); im++){
	susc(0,im) += norm(dsusc(im,0))*fact/sqr(biom[im]);
	susc(1,im) += norm(dsusc(im,1))*fact/sqr(biom[im]);
      }
    }
  }
  susc(0,0) = observables[1];
  susc(1,0) = observables[2];
  
//   // vertex contains the matrix element of the observable
//   // (either Sz or N) at all kinks which contribute to the trace
//   vertex.resize(nsusc*contrib.size(), Operators.size()+1);
//   for (int iq=0; iq<contrib.size(); iq++){
//     int ist = contrib[iq];
//     int istate = npraStates[ist].istate;
//     vertex(nsusc*iq+0, 0) = cluster.Sz[istate];
//     vertex(nsusc*iq+1, 0) = cluster.Ns[istate];
//     for (int ip=0; ip<Operators.size(); ip++){
//       int jstate = state_evolution_left[ist][ip].istate;
//       vertex(nsusc*iq+0, ip+1) = cluster.Sz[jstate];
//       vertex(nsusc*iq+1, ip+1) = cluster.Ns[jstate];
//     }
//   }
//   susc=0;
//   dsusc1.resize(nsusc*contrib.size(), iom.size());
//   //  dsusc1.Product("N", "N", vertex,dexp); // matrix product -> optimized for speed
//   //  cout<<"sizes "<<vertex.size_N()<<" "<<vertex.size_Nd()<<" "<<dexp.size_Nd()<<endl;
//   for (int i=0; i<dsusc1.size_N(); i++){
//     for (int j=0; j<iom.size(); j++){
//       dcomplex sum=0;
//       for (int k=0; k<Operators.size()+1; k++){
// 	sum += vertex(i,k)*dexp(k,j);
//       }
//       dsusc1(i,j) = sum;
//     }
//   }
//   for (int iq=0; iq<contrib.size(); iq++){
//     int ist = contrib[iq];
//     double P_i = fabs(divide(Trace[ist],matrix_element)); // probability for this state
//     double fact = P_i/common::beta;
//     for (int im=0; im<iom.size(); im++){
//       susc(0,im) += norm(dsusc1(nsusc*iq+0,im))*fact/sqr(iom[im]);
//       susc(1,im) += norm(dsusc1(nsusc*iq+1,im))*fact/sqr(iom[im]);
//     }
//   }
}


template <class Rand>
void CTQMC<Rand>::CleanUpdate(int i)
{
  static function2D<dcomplex> backup_Gf(4,iom.size());
  static function2D<double> backup_MD(MD[0].size_N(),MD[0].size_Nd());
  
  for (int ifl=0; ifl<common::N_ifl; ifl++){
    backup_MD = MD[ifl];
	
    TMD[ifl].CleanUpdateMatrix(MD[ifl], intervals[ifl].size()/2, intervals[ifl]);

    for (int il=0; il<MD[ifl].size_N(); il++)
      for (int jl=0; jl<MD[ifl].size_Nd(); jl++)
	if (fabs(backup_MD[il][jl]-MD[ifl][il][jl])>1e-5) cerr<<"Difference in MD at "<<i<<" "<<il<<" "<<jl<<" "<<backup_MD[il][jl]-MD[ifl][il][jl]<<endl;
	
    backup_Gf = TMD[ifl].gf();
	
    TMD[ifl].CleanUpdateGf(MD[ifl], intervals[ifl].size()/2, intervals[ifl]);
	
    for (int il=0; il<backup_Gf.size_N(); il++)
      for (int jl=0; jl<backup_Gf.size_Nd(); jl++)
	if (norm(backup_Gf[il][jl]-TMD[ifl].gf()[il][jl])>1e-5) cerr<<"Difference in Gf at "<<i<<" "<<il<<" "<<jl<<" "<<backup_Gf[il][jl]-TMD[ifl].gf()[il][jl]<<endl;
  }
}

template <class Rand>
void CTQMC<Rand>::StoreCurrentState(function1D<double>& aver_observables)
{
  for (size_t ifl=0; ifl<Gaver.size(); ifl++)  Gaver[ifl] += TMD[ifl].gf();
  
  AProb += Prob;
  aver_observables += observables;

  for (int ifl=0; ifl<kaver.size(); ifl++) kaver[ifl] += intervals[ifl].size()/2;

  if (common::SampleSusc) aver_susc += susc;
  
  Naver++;
}


template <class Rand>
void CTQMC<Rand>::PrintInfo(int i, const function1D<double>& aver_observables)
{
  cout<<setw(9)<<i+1<<"    ";
  double nf = aver_observables[0]/Naver/cluster.Nm;
  double mf = aver_observables[3]/Naver/cluster.Nm;
  cout<<left<<" nf="<<setw(12)<<nf<<"    ";
  cout<<left<<" <Sz>="<<setw(12)<<mf<<"    ";
  double chiS = aver_observables[1]/Naver/cluster.Nm;
  double chiC = aver_observables[2]/Naver/cluster.Nm;
  cout<<left<<" chiS="<<setw(12)<<(chiS-sqr(mf)*common::beta)<<"    ";
  cout<<left<<" chiD="<<setw(12)<<(chiC-sqr(0.5*nf)*common::beta)<<"    ";

  {
    double Epot=0;
    for (int ist=0; ist<npraStates.size(); ist++){
      double dmuN = common::mu*cluster.Ns[ist+1];
      for (int m=0; m<cluster.msize(ist+1); m++)
	Epot += (cluster.Ene[ist+1][m]+dmuN)*AProb[ist][m];
    }
    Epot/=Naver;
    cout<<left<<" Epot="<<setw(10)<<Epot<<" ";

    double kt=0;
    for (int ifl=0; ifl<kaver.size(); ifl++) kt += kaver[ifl];
    kt/=Naver;
    double Ekin = -kt/common::beta;
    cout<<left<<" Ekin="<<setw(10)<<Ekin<<" ";
  }
  
  for (int op=0; op<cluster.HF_M.size(); op++){
    for (int fl=0; fl<cluster.N_unique_fl; fl++){
      
      double sum=0;
      for (int ist=0; ist<npraStates.size(); ist++)
	for (int m=0; m<cluster.msize(ist+1); m++)
	  sum += cluster.HF_M[op][fl][ist+1][m][m]*AProb[ist][m];
      sum/=Naver;

      cout<<setw(8)<<sum<<" ";
    }
  }
  cout<<right<<endl;

  if ((i+1)%common::Naver==0){
    ofstream tout(NameOfFile_("Gaver",common::my_rank,i+1).c_str());
    tout.precision(16);
    for (int im=0; im<iom.size(); im++){
      tout<<setw(20)<<iom[im]<<" ";
      for (int ifl=0; ifl<common::N_ifl; ifl++)
	for (int b=0; b<Gaver[ifl].size_N(); b++)
	  tout<<setw(20)<<Gaver[ifl][b][im]/Naver<<" ";
      tout<<endl;
    }
    ofstream pout(NameOfFile_("Probability",common::my_rank,i+1).c_str());
    pout.precision(16);
    for (int j=0; j<AProb.size_N(); j++)
      for (int k=0; k<AProb[j].size(); k++)
	pout<<setw(3)<<j+1<<" "<<setw(3)<<k<<" "<<setw(20)<<AProb[j][k]/Naver<<endl;
  }
}

template <class Rand>
double CTQMC<Rand>::sample(int max_steps)
{
  Number Zloc = 0;
  for (int i=0; i<Trace.size(); i++) Zloc += Trace[i];
  matrix_element = Zloc;
  
  ComputeAverage();
  
  //  aver_observables.resize(observables.size());
  aver_observables=0;
  
  for (int i=0; i<max_steps; i++){
    //    debug_current_step = i;
    int ifl = cluster.ifl_from_fl[static_cast<int>(common::N_flavors*rand())];
    if (common::PChangeOrder>rand() || i<common::warmup/2){// change order, in warmup one should reach average order
      if (1-rand()>common::TwoKinks){ // One kind add/remove step
	if (rand()>0.5)	Add_One_Kink(i, ifl);
	else Remove_One_Kink(i, ifl);
      }else{                       // Two kink add/remove step
	cout<<"Should not happen since 2k="<<common::TwoKinks<<endl;
	if (rand()>0.5) Add_Two_Kinks(i, ifl);
	else Remove_Two_Kinks(i, ifl);
      }
    }else Move_A_Kink(i, ifl);      // Keep the same number of kinks
    
    if ((i+1)%common::CleanUpdate==0) CleanUpdate(i);
    if (common::GlobalFlip>0 && (i+1)%common::GlobalFlip==0) GlobalFlip(i);
    
    histogram[Operators.size()/2]++;

    if ((i+1)%common::tsample==0 && i>common::warmup) StoreCurrentState(aver_observables);
    if (common::SampleGtau>0 && (i+1)%common::SampleGtau==0){
      for (size_t ifl=0; ifl<TMD.size(); ifl++) TMD[ifl].ComputeGtau(MD[ifl], intervals[ifl], Gtau[ifl]);
      NGta++;
    }

    if ((i+1)%common::Ncout==0 && i>2*common::warmup) PrintInfo(i, aver_observables);
  }
  
  for (size_t ifl=0; ifl<Gaver.size(); ifl++) Gaver[ifl] *=(1./Naver);
  
  kaver *= 1./(Naver);
  aver_observables *= (1./Naver/cluster.Nm);

  aver_observables[1] -= sqr(aver_observables[3])*common::beta;
  aver_observables[2] -= sqr(0.5*aver_observables[0])*common::beta;
  
  if (common::SampleSusc){

    aver_susc *= (1./Naver/cluster.Nm);
    aver_susc[0][0] = aver_observables[1];
    aver_susc[1][0] = aver_observables[2];
    
//     ofstream outs("susc0.dat");
//     outs<<0<<" "<<aver_observables[1]<<" "<<aver_observables[2]<<endl;
//     for (int im=0; im<aver_susc.size_Nd(); im++)
//       outs<<biom[im]<<" "<<aver_susc[0][im]<<" "<<aver_susc[1][im]<<endl;

  }
  
  cout<<"successful="<<successful<<" ratio="<<successful/(max_steps+0.0)<<endl;
  
  if (common::SampleGtau>0){
    double fct = Gtau[0].size_Nd()/(NGta+0.0)/(common::beta*common::beta);
    for (size_t ifl=0; ifl<TMD.size(); ifl++) Gtau[ifl] *= fct;
  }
  return aver_observables[0];
}

template <class Rand>
bool CTQMC<Rand>::SaveStatus(int my_rank)
{
  ofstream out(NameOfFile("status",my_rank).c_str());
  out.precision(16);
  if (!out){cerr<<"Could not save status!"<<endl; return false;}
  out<<"# Status file for ctqmc. Used to save all times t_s, t_e."<<endl;
  out<<intervals.size()<<endl;
  for (size_t ifl=0; ifl<intervals.size(); ifl++){
    out<<intervals[ifl].size()/2<<endl;
    for (int it=0; it<intervals[ifl].size()/2; it++){
      out<<intervals[ifl].time_s(it)<<" "<<intervals[ifl].time_e(it)<<" "<<intervals[ifl].btype_s(it)<<" "<<intervals[ifl].btype_e(it)<<endl;
    }
  }
  return true;
}

template <class Rand>
bool CTQMC<Rand>::RetrieveStatus(int my_rank)
{
  string str;
  ifstream inp(NameOfFile("status",my_rank).c_str());
  if (!inp) return false;
  getline(inp,str); // comment
  int tN_ifl;
  inp>>tN_ifl;
  getline(inp,str);
  if (!inp) return false;
  for (size_t ifl=0; ifl<intervals.size(); ifl++){
    int dim = cluster.ifl_dim[ifl];
    int nt;
    inp>>nt;
    getline(inp,str);
    if (!inp) {clog<<"Only partly restored file status."<<my_rank<<endl; break;}
    if (nt<0 || nt>=intervals[ifl].fullsize()/2){cerr<<"Too many input times. Skipping...."<<endl; continue;}
    for (int it=0; it<nt; it++){
      double t_start, t_end;
      inp>>t_start>>t_end;
      if (t_start<0 || t_start>common::beta || t_end<0 || t_end>common::beta) {cerr<<"The times from starting file are wrong. Skipping..."<<endl; continue;}
      int bfls, bfle;
      inp>>bfls>>bfle;
      if (bfls<0 || bfls>dim || bfle<0 || bfle>dim) {cerr<<"The baths from starting file are wrong. Skipping..."<<endl; continue;}
      getline(inp,str);
      
      int kn = intervals[ifl].size()/2;// should be same as it!
      int fls = cluster.fl_from_ifl[ifl][bfls];
      int fle = cluster.fl_from_ifl[ifl][bfle];
      
      int is, ie;      
      intervals[ifl].Find_is_ie(t_start, is, t_end, ie);// shoule be same as it
      double ratioD = TMD[ifl].AddUpdateMatrix(MD[ifl], kn, t_start, is, bfls, t_end, ie, bfle, intervals[ifl]);
      pair<int,int> ipl = intervals[ifl].InsertExponents(t_start, is, bfls, t_end, ie, bfle);
      TMD[ifl].AddUpdate_Gf(intervals[ifl]);
      
      pair<int,int> opt = Operators.Add_Cd_C(fls, t_start, fle, t_end, IntervalIndex(ifl, nIntervals::cd, ipl.first), IntervalIndex(ifl, nIntervals::c, ipl.second));
      
      int op_i = min(opt.first, opt.second);
      int op_f = max(opt.first, opt.second);
      Number new_matrix_element = UpdateStateEvolution(state_evolution_left, state_evolution_right, op_i, op_f, Operators, npraStates, cluster, Trace, Prob);
      //      clog<<it<<" "<<kn<<" "<<is<<" "<<ie<<endl;
      detD[ifl] *= ratioD;
      matrix_element = new_matrix_element;
    }
  }
  clog<<"Old status retrieved!"<<endl;
  ComputeAverage();
  CleanUpdate(0);
  return true;
}

template <class Rand>
void CTQMC<Rand>::RecomputeCix()
{  cluster.RecomputeCix(AProb,Naver,common::treshold,npraStates);}

template <class Rand>
void CTQMC<Rand>::GlobalFlip(int i)
{
  for (int iu=0; iu<cluster.gflip_ifl.size(); iu++){
    int ifa = cluster.gflip_ifl[iu].first;
    int ifb = cluster.gflip_ifl[iu].second;
    int sizea = cluster.ifl_dim[ifa];
    int sizeb = cluster.ifl_dim[ifb];
    if (sizea!=sizeb) cerr<<"Sizes of symmetric baths are different! This is not supported!"<<endl;
    //    int ka = intervals[ifa].size()/2;
    //    int kb = intervals[ifb].size()/2;

    double ratioDb=1, ratioDa=1;
    if (!cluster.ifl_equivalent[ifa][ifb]){
      ratioDb = TMD[ifa].GlobalFlipDetRatio(intervals[ifa], MD[ifa], Deltat[ifb]);
      ratioDa = TMD[ifb].GlobalFlipDetRatio(intervals[ifb], MD[ifb], Deltat[ifa]);
    }
    Number ms_new = ComputeGlobalTrace(Operators, npraStates, cluster, cluster.gflip_fl[iu]);
    
    double ms = divide(ms_new,matrix_element);
    double P = fabs(ms*ratioDa*ratioDb);
    
    if (P>1-rand()){
      tinterval.copy_data(intervals[ifa]); intervals[ifa].copy_data(intervals[ifb]); intervals[ifb].copy_data(tinterval);//swap
      
      if (cluster.ifl_equivalent[ifa][ifb]){
	tMD = MD[ifa]; MD[ifa] = MD[ifb]; MD[ifb] = tMD;
	swapGf(TMD[ifa], TMD[ifb]);
      }else{
	TMD[ifa].CleanUpdateMatrix(MD[ifa], intervals[ifa].size()/2, intervals[ifa]);
	TMD[ifb].CleanUpdateMatrix(MD[ifb], intervals[ifb].size()/2, intervals[ifb]);
	TMD[ifa].CleanUpdateGf(MD[ifa], intervals[ifa].size()/2, intervals[ifa]);
	TMD[ifb].CleanUpdateGf(MD[ifb], intervals[ifb].size()/2, intervals[ifb]);
      }

      Operators.GlobalFlipChange(cluster.gflip_fl[iu], ifa, ifb);
      
      Number new_matrix_element = UpdateStateEvolution(state_evolution_left, state_evolution_right, 0, Operators.size()-1, Operators, npraStates, cluster, Trace, Prob);
      if (fabs(fabs(divide(ms_new,new_matrix_element))-1)>1e-6) {cerr<<"Matrix elements are not the same in global flip "<<ms_new<<" "<<new_matrix_element<<endl;}
      detD[ifa] *= ratioDa;
      detD[ifb] *= ratioDb;
      matrix_element = new_matrix_element;
      successful++;
      ComputeAverage();
    }
  }
}


  
int main(int argc, char *argv[])
{
  setvbuf (stdout, NULL, _IONBF, BUFSIZ);
  int my_rank, mpi_size, Master;
  MPI_Init(argc, argv, my_rank, mpi_size, Master);
  
  string inputf = "PARAMS";      // filename of the parameter file
  string fDelta   = "Delta.inp"; // input bath function Delta(iom)
  string fcix     = "imp.cix";   // input file with atom eigenvalues and eigenvectors (exact diagonalization results)
  string fGf      = "Gf.out";    // output file for green's function G(iom)
  string fSig     = "Sig.out";   // output file for self-energy Sig(iom)
  double mu       = 0;           // chemical potential
  double beta     = 100;         // inverse temperature
  double U        = 0;           // Coulomb repulsion (should be used only in single site DMFT)
  int    M        = 1000000;     // number of all qmc steps
  int    Ntau     = 500;         // number of imaginary time points for Delta(tau)
  int    Nmax     = 1024;        // maximum number of kinks
  int    nom      = 100;         // number of frequency points sample in qmc
  int    nomb     = 50;          // number of bosonic frequencies for susceptibilities
  double PChangeOrder = 0.9;     // probability to change the order (as compared to move a kink)
  int    tsample  = 10;          // how often to record measurements (each tsample steps)
  int    warmup   = 500000;      // how many qmc steps should be ignored
  int CleanUpdate = 100000;      // clean update is sometimes useful
  double minM     = 1e-10;       // trace shuld always be larger than this minimal value when accepting the step
  double minD     = 1e-10;       // determinant of hybrodization should always be larger than this number when accepting the step
  int    Ncout    = 1000000;       // screen output for basic information every Ncout steps
  int    Naver    = 500000000;     // much more information about simulation every Naver steps
  double TwoKinks = 0.;          // probability to add two kinks
  int    GlobalFlip= -1;         // global flip is tried after GlobalFlip qmc steps
  double treshold = 1e-10;       // energy to throw away atomic states when creating new.cix for next iteration
  int  SampleGtau = -1;          // How often to sample for Gtau (if at all)
  int  ReduceComm = 0;           // only part of Gtau is transfered between nodes (if slow communication)
  int    Ncorrect = -1;          // baths with index higher than Ncorrect should not be added a high-frequency tail (if -1, all tails are added)
  int         aom = 3;           // number of frequency points to find Sigma(om_last_sampled)
  int         som = 3;           // number of frequency points to find Susc(om_last_sampled)
  int    PreciseP = 1;           // computes probabilities more precisely
  double   sderiv = 0.1;         // maximum mismatch when matching high and low frequency of imaginary part of Sigmas
  double minDeltat= 1e-7;        // Delta(tau) is sometimes not causal due to numerical error. In this case we set it to small value minDeltat.
  bool SampleSusc = false;        // If spin and charge dynamic susceptibility should be sampled during simulation
  
  ifstream inp(inputf.c_str());
  if (!inp.good()) {cerr<<"Missing input file!... exiting "<<endl; return 1;}
  while (inp){
    string str;
    inp>>str;
    if (str=="#") inp.ignore(2000,'\n');
    if (str=="Delta")       inp>>fDelta;
    if (str=="cix")         inp>>fcix;
    if (str=="mu")          inp>>mu;
    if (str=="beta")        inp>>beta;
    if (str=="U")           inp>>U;
    if (str=="M")           inp>>M;
    if (str=="Ntau")        inp>>Ntau;
    if (str=="Nmax")        inp>>Nmax;
    if (str=="nom")         inp>>nom;
    if (str=="nomb")         inp>>nomb;
    if (str=="tsample")     inp>>tsample;
    if (str=="warmup")      inp>>warmup;
    if (str=="CleanUpdate") inp>>CleanUpdate;
    if (str=="minM")        inp>>minM;
    if (str=="minD")        inp>>minD;
    if (str=="PChangeOrder")inp>>PChangeOrder;
    if (str=="Ncout")       inp>>Ncout;
    if (str=="Naver")       inp>>Naver;
    if (str=="Gf")          inp>>fGf;
    if (str=="Sig")         inp>>fSig;
    if (str=="TwoKinks")    inp>>TwoKinks;
    if (str=="GlobalFlip")  inp>>GlobalFlip;
    if (str=="treshold")    inp>>treshold;
    if (str=="SampleGtau")  inp>>SampleGtau;
    if (str=="ReduceComm")  inp>>ReduceComm;
    if (str=="Ncorrect")    inp>>Ncorrect;
    if (str=="aom")         inp>>aom;
    if (str=="som")         inp>>som;
    if (str=="PreciseP")    inp>>PreciseP;
    if (str=="sderiv")      inp>>sderiv;
    if (str=="minDeltat")   inp>>minDeltat;
    if (str=="SampleSusc")  inp>>SampleSusc;
  }
  clog<<"Input parameters are:"<<endl;
  clog<<"fDelta="<<fDelta<<endl;
  clog<<"fcix="<<fcix<<endl;
  clog<<"mu="<<mu<<endl;
  clog<<"beta="<<beta<<endl;
  clog<<"U="<<U<<endl;
  clog<<"M="<<M<<endl;
  clog<<"Ntau="<<Ntau<<endl;
  clog<<"Nmax="<<Nmax<<endl;
  clog<<"nom="<<nom<<endl;
  clog<<"nomb="<<nomb<<endl;
  clog<<"PChangeOrder"<<PChangeOrder<<endl;
  clog<<"tsample="<<tsample<<endl;
  clog<<"warmup="<<warmup<<endl;
  clog<<"CleanUpdate="<<CleanUpdate<<endl;
  clog<<"minM="<<minM<<endl;
  clog<<"minD="<<minD<<endl;
  clog<<"Ncout="<<Ncout<<endl;
  clog<<"Naver="<<Naver<<endl;
  clog<<"Gf="<<fGf<<endl;
  clog<<"Sig="<<fSig<<endl;
  clog<<"TwoKinks="<<TwoKinks<<endl;
  clog<<"GlobalFlip="<<GlobalFlip<<endl;
  clog<<"treshold="<<treshold<<endl;
  clog<<"SampleGtau="<<SampleGtau<<endl;
  clog<<"ReduceComm="<<ReduceComm<<endl;
  clog<<"Ncorrect="<<Ncorrect<<endl;
  clog<<"aom="<<aom<<endl;
  clog<<"som="<<som<<endl;
  clog<<"PreciseP="<<PreciseP<<endl;
  clog<<"sderiv="<<sderiv<<endl;
  clog<<"minDeltat="<<minDeltat<<endl;
  clog<<"SampleSusc="<<SampleSusc<<endl;
  ClusterData cluster;
  ifstream incix(fcix.c_str());
  cluster.ParsInput(incix);
  cluster.EvaluateMa(beta,mu,U);
  
  common::SetParameters(my_rank,mpi_size,mu,U,beta,cluster.max_size,cluster.N_flavors,cluster.N_ifl,tsample,warmup,
			CleanUpdate,minM,minD,PChangeOrder,Ncout,Naver,TwoKinks,GlobalFlip,treshold,SampleGtau,PreciseP,
			minDeltat,SampleSusc);

  ifstream input(fDelta.c_str());
  int n, m;
  if (!CheckStream(input, n, m)) { cerr<<"Something wrong in reading input Delta="<<fDelta<<endl; return 1;}
  
  mesh1D iom_large;
  function2D<dcomplex> Deltaw;

  vector<double> ah;
  if (!ReadDelta(cluster.N_unique_fl, input, n, m, iom_large, Deltaw, ah, common::beta)){ cerr<<"Something wrong in reading input Delta="<<fDelta<<endl; return 1;}

  //  int iseed = 0;
  int iseed = time(0)+my_rank*10;
  RanGSL rand(iseed);
  cout<<"iseed="<<iseed<<endl;
  
  CTQMC<RanGSL> ctqmc(rand, cluster, Nmax, iom_large, Deltaw, ah, nom, nomb, Ntau, my_rank==Master);

  // Main part of the code : sampling
  ctqmc.RetrieveStatus(my_rank);
  double nf = ctqmc.sample(M);
  ctqmc.SaveStatus(my_rank);

  // Green's function is compressed to few components only
  function2D<dcomplex> Gd(cluster.N_unique_fl,ctqmc.ioms().size());
  Gd=0;
  for (int im=0; im<ctqmc.ioms().size(); im++){
    dcomplex iomega(0,ctqmc.ioms()[im]);
    for (int ifl=0; ifl<cluster.N_ifl; ifl++){
      for (int ib=0; ib<cluster.N_baths(ifl); ib++){
	dcomplex gf = ctqmc.G_aver()[ifl](ib,im);
	if (cluster.conjg[ifl][ib]) gf = gf.conj();
	gf *= cluster.sign[ifl][ib];
	Gd(cluster.bfl_index[ifl][ib],im) += gf;
      }
    }
    for (int fl=0; fl<cluster.N_unique_fl; fl++) Gd(fl,im) /= cluster.fl_deg[fl];
  }
  // G(tau) is also compressed to few components only
  // the number of time slices is reduces to first and last 3 in case ReduceComm is set to true
  // In this case, full Gtau will not be exchaned or printed, but only 6 times will be printed
  // This is done to prevent SIGFAULT in parallel execution due to low communication pipeline
  int ntau = ctqmc.gtau()[0].size_Nd();
  function2D<double> Gtau;
  function1D<int> tau_ind;
  if (!ReduceComm || ntau<6){
    Gtau.resize(cluster.N_unique_fl,ntau);
    tau_ind.resize(ntau);
    for (int it=0; it<tau_ind.size(); it++) tau_ind[it]=it;
  }  else {
    Gtau.resize(cluster.N_unique_fl,6);
    tau_ind.resize(6);
    for (int it=0; it<3; it++) tau_ind[it] = it;
    for (int it=0; it<3; it++) tau_ind[6-it-1] = ntau-it-1;
  }
  if (common::SampleGtau>0){
    //    double dt = common::beta/ntau;
    Gtau=0;
    for (int iti=0; iti<tau_ind.size(); iti++){
      int it = tau_ind[iti];
      for (int ifl=0; ifl<cluster.N_ifl; ifl++){
	for (int ib=0; ib<cluster.N_baths(ifl); ib++){
	  double gf = ctqmc.gtau()[ifl](ib,it);
	  if (cluster.conjg[ifl][ib]) gf = -ctqmc.gtau()[ifl](ib,ntau-it-1);
	  gf *= cluster.sign[ifl][ib];
	  Gtau(cluster.bfl_index[ifl][ib],iti) += gf;
	}
      }
      for (int fl=0; fl<cluster.N_unique_fl; fl++) Gtau(fl,iti) /= cluster.fl_deg[fl];
    }
  }
  
  Reduce(my_rank, Master, mpi_size, ctqmc.Histogram(), Gd, ctqmc.AverageProbability(), ctqmc.Observables(), ctqmc.k_aver(), ctqmc.Susc(), Gtau);
  
  if (my_rank==Master){
    ctqmc.RecomputeCix();
    function1D<double> mom;
    ctqmc.ComputeFinalAverage(mom);
    cout<<"nf="<<ctqmc.Observables()[0]<<" chiS="<<ctqmc.Observables()[1]<<" chiD="<<ctqmc.Observables()[2]<<endl;
    function2D<dcomplex> Sigma(cluster.N_unique_fl,iom_large.size()), Gf(cluster.N_unique_fl,iom_large.size());
    function2D<dcomplex> Gf_1(cluster.N_unique_fl,iom_large.size());
    // Computes the high frequency part of Green's function and self-energy 
    Gf=0;
    Gf_1=0;
    for (int im=0; im<ctqmc.ioms().size(); im++){
      dcomplex iomega(0,ctqmc.ioms()[im]);
      for (int fl=0; fl<cluster.N_unique_fl; fl++) Gf(fl,im) = Gd(fl, im);
      for (int ifl=0; ifl<cluster.N_ifl; ifl++) Inverse_Gf(cluster.ifl_dim[ifl], cluster.N_unique_fl, cluster.bfl_index[ifl], cluster.sign[ifl], cluster.conjg[ifl], im, Gf, Gf_1);
      for (int fl=0; fl<cluster.N_unique_fl; fl++) Gf_1(fl,im) /= cluster.fl_deg[fl];
      for (int fl=0; fl<cluster.N_unique_fl; fl++) Sigma(fl,im) = (iomega+mu)*cluster.Id[fl]-cluster.epsk[fl]-Gf_1(fl,im)-Deltaw(fl,im);
    }
    function2D<double> nt(2,cluster.N_unique_fl);
    nt=0;
    double dt = common::beta/ntau;
    if (common::SampleGtau>0){
      int nti = tau_ind.size();
      for (int fl=0; fl<cluster.N_unique_fl; fl++){
	nt[0][fl] = expin(0.0,make_pair(0.5*dt,Gtau[fl][0]),make_pair(1.5*dt,Gtau[fl][1]));
	nt[1][fl] = expin(common::beta,make_pair((ntau-0.5)*dt,Gtau[fl][nti-1]),make_pair((ntau-1.5)*dt,Gtau[fl][nti-2]));
	cout<<"nt[0]["<<fl<<"]="<<nt[0][fl]<<endl;
	cout<<"nt[1]["<<fl<<"]="<<nt[1][fl]<<endl;
      }
    }
    if (cluster.QHB1){
      cluster.Read_for_HB1(incix, mu, U);
      cluster.HB1(beta, mu, iom_large, ctqmc.AverageProbability(), nom, aom, Deltaw, Sigma);
    }else{
      ComputeMoments(cluster, iom_large, ctqmc.ioms().size(), nf, mom, ctqmc.k_aver(), nt, Sigma, true, Ncorrect, aom, sderiv);
    }
    Gf=0;
    Gf_1=0;
    for (int im=0; im<iom_large.size(); im++){
      dcomplex iomega(0,iom_large[im]);
      for (int fl=0; fl<cluster.N_unique_fl; fl++) Gf_1(fl,im) = (iomega+mu)*cluster.Id[fl]-cluster.epsk[fl]-Sigma(fl,im)-Deltaw(fl,im);
      for (int ifl=0; ifl<cluster.N_ifl; ifl++) Inverse_Gf(cluster.ifl_dim[ifl], cluster.N_unique_fl, cluster.bfl_index[ifl], cluster.sign[ifl], cluster.conjg[ifl], im, Gf_1, Gf);
      for (int fl=0; fl<cluster.N_unique_fl; fl++) Gf(fl,im) /= cluster.fl_deg[fl];
    }
    
    ofstream gout(fGf.c_str()); gout.precision(16);
    gout<<"# nf="<<nf<<" mu="<<mu<<" T="<<1/beta<<endl;
    for (int im=0; im<iom_large.size(); im++){
      gout<<iom_large[im]<<" ";
      for (int fl=0; fl<Gf.size_N(); fl++) gout<<Gf(fl,im)<<" ";
      gout<<endl;
    }
    ofstream sout(fSig.c_str()); sout.precision(16);
    sout<<"# nf="<<nf<<" mu="<<mu<<" T="<<1/beta<<endl;
    for (int im=0; im<iom_large.size(); im++){
      sout<<iom_large[im]<<" ";
      for (int fl=0; fl<Sigma.size_N(); fl++) sout<<Sigma(fl,im)<<" ";
      sout<<endl;
    }
    double sum=0;
    for (int i=0; i<ctqmc.Histogram().size(); i++) sum += ctqmc.Histogram()[i];
    for (int i=0; i<ctqmc.Histogram().size(); i++) ctqmc.Histogram()[i]/=sum;
    ofstream hout("histogram.dat");
    hout<<"# nf="<<nf<<endl;
    for (int i=0; i<ctqmc.Histogram().size(); i++)
      hout<<setw(3)<<i<<"  "<<setw(10)<<ctqmc.Histogram()[i]<<endl;


    if (common::SampleGtau>0){
      ofstream out("Gtau.dat"); out.precision(16);
      double dt = common::beta/ntau;
      for (int iti=0; iti<Gtau.size_Nd(); iti++){
	int it = tau_ind[iti];
	out<<setw(20)<<dt*(it+0.5)<<" ";
	for (int fl=0; fl<cluster.N_unique_fl; fl++){
	  out<<setw(25)<<Gtau[fl][iti]<<" ";
	}
	out<<endl;
      }
    }

    if (common::SampleSusc){
      ofstream out("susc.dat"); out.precision(16);
      int nsusc = ctqmc.Susc().size_N();
      int nom_s = ctqmc.iomb().size();
      int nom_l = iom_large.size();
      out<<0.0<<" "<<ctqmc.Observables()[1]<<" "<<ctqmc.Observables()[2]<<endl;
      for (int im=0; im<nom_s; im++){
	out<<ctqmc.iomb()[im]<<" ";
	for (int l=0; l<nsusc; l++) out<<ctqmc.Susc()(l,im)<<" ";
	out<<endl;
      }
      function1D<double> alpha_susc(nsusc);
      alpha_susc=0;
      for (int im=nom_s-1; im>nom_s-1-som; im--){
	for (int l=0; l<nsusc; l++) alpha_susc[l] += ctqmc.Susc()(l,im);
      }
      double om2 = sqr(ctqmc.iomb()[nom_s-1-som/2]);
      alpha_susc *= om2/som;
      for (int im=nom_s; im<nom_l; im++){
	double omega = 2*im*M_PI/common::beta;
	out<<omega<<" ";
	for (int l=0; l<nsusc; l++) out<<alpha_susc[l]/sqr(omega)<<" ";
	out<<endl;
      }
    }
  }
  MPI_finalize();
  return 0;
}
double common::beta;
double common::U;
double common::mu;
int common::max_size;
int common::N_flavors;
double common::minM;
double common::minD;
int common::tsample;
int common::warmup;
int common::CleanUpdate;
double common::PChangeOrder;
int common::Ncout;
int common::Naver;
const int nIntervals::c;
const int nIntervals::cd;
int common::my_rank;
int common::mpi_size;
int common::N_ifl;
double common::TwoKinks;
double common::treshold;
int common::GlobalFlip;
int common::SampleGtau;
int common::PreciseP;
double common::minDeltat;
bool common::SampleSusc;
