diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..091dcf2 --- /dev/null +++ b/.clang-format @@ -0,0 +1,41 @@ +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +BasedOnStyle: LLVM + +# indent 4 spaces +IndentWidth: 4 + +# break before function +#BreakBeforeBraces: Linux +BreakBeforeBinaryOperators: NonAssignment +#BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +AllowShortFunctionsOnASingleLine: false +# Pas de saut de ligne entre type de retour et fonction +AlwaysBreakAfterDefinitionReturnType: None +PenaltyReturnTypeOnItsOwnLine: 300 +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: false + BeforeCatch: false + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true + +ColumnLimit: 180 + +# arguments and parameters in same line or one line for each +BinPackArguments: false +BinPackParameters: false + +# keep up to two empty lines +MaxEmptyLinesToKeep: 2 \ No newline at end of file diff --git a/baumwelch.cc b/baumwelch.cc index 631a2f7..b5c80dd 100644 --- a/baumwelch.cc +++ b/baumwelch.cc @@ -5,7 +5,7 @@ baumWelch::baumWelch(obsSequence *data) mObsSequence = data; } -void baumWelch::computeBaumWelch(hmmResults *results ,modelProb *model) +void baumWelch::computeBaumWelch(hmmResults *results, modelProb *model) { mHmmResults = results; mModelProb = model; @@ -13,81 +13,92 @@ void baumWelch::computeBaumWelch(hmmResults *results ,modelProb *model) ////////////////////// // maximum likelihood estimation of the transition and emission probabilities - map pseudo_count_obsD_int ; - map pseudo_count_obsD_ext ; - map pseudo_count_obsA_int ; - map pseudo_count_obsA_ext ; - map total_pseudo_count_obs_int ; - map total_pseudo_count_obs_ext ; - //map total_pseudo_count_obs_ext ; - - map proba_obsD_int ; - map proba_obsD_ext ; - - for ( int i = 1 ; i < mObsSequence->size()-1 ; ++i ) { - - if (mObsSequence->at(i+1).getUlindi() == 'D'){ - pseudo_count_obsD_int[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i+1] * mHmmResults->mbwd_stateI_scaled[i+1] ; - pseudo_count_obsD_ext[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateE_scaled[i+1] * mHmmResults->mbwd_stateE_scaled[i+1] ; - } - else { - pseudo_count_obsA_int[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i+1] * mHmmResults->mbwd_stateI_scaled[i+1] ; - pseudo_count_obsA_ext[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateE_scaled[i+1] * mHmmResults->mbwd_stateE_scaled[i+1] ; - } + map pseudo_count_obsD_int; + map pseudo_count_obsD_ext; + map pseudo_count_obsA_int; + map pseudo_count_obsA_ext; + map total_pseudo_count_obs_int; + map total_pseudo_count_obs_ext; + // map total_pseudo_count_obs_ext ; + + map proba_obsD_int; + map proba_obsD_ext; + + for (int i = 1; i < mObsSequence->size() - 1; ++i) + { + + if (mObsSequence->at(i + 1).getUlindi() == 'D') + { + pseudo_count_obsD_int[mObsSequence->at(i + 1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i + 1] * mHmmResults->mbwd_stateI_scaled[i + 1]; + pseudo_count_obsD_ext[mObsSequence->at(i + 1).getDerived()] += mHmmResults->mfwd_stateE_scaled[i + 1] * mHmmResults->mbwd_stateE_scaled[i + 1]; + } + else + { + pseudo_count_obsA_int[mObsSequence->at(i + 1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i + 1] * mHmmResults->mbwd_stateI_scaled[i + 1]; + pseudo_count_obsA_ext[mObsSequence->at(i + 1).getDerived()] += mHmmResults->mfwd_stateE_scaled[i + 1] * mHmmResults->mbwd_stateE_scaled[i + 1]; + } } - for ( std::map::iterator it = pseudo_count_obsD_int.begin(); it != pseudo_count_obsD_int.end(); ++it ) { - int derived = it->first ; - total_pseudo_count_obs_int[derived] += pseudo_count_obsD_int[derived] ; - //total_pseudo_count_obs_ext[derived] += pseudo_count_obsD_ext[derived] ; - if (derived != pseudo_count_obsD_int.rbegin()->first ){ - total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsD_ext[derived]; - } - else { - total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsD_ext[derived]; - } + for (std::map::iterator it = pseudo_count_obsD_int.begin(); it != pseudo_count_obsD_int.end(); ++it) + { + int derived = it->first; + total_pseudo_count_obs_int[derived] += pseudo_count_obsD_int[derived]; + // total_pseudo_count_obs_ext[derived] += pseudo_count_obsD_ext[derived] ; + if (derived != pseudo_count_obsD_int.rbegin()->first) + { + total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsD_ext[derived]; + } + else + { + total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsD_ext[derived]; + } } - for ( std::map::iterator it = pseudo_count_obsA_int.begin(); it != pseudo_count_obsA_int.end(); ++it ) { - int derived = it->first ; - total_pseudo_count_obs_int[derived] += pseudo_count_obsA_int[derived] ; - //total_pseudo_count_obs_ext[derived] += pseudo_count_obsA_ext[derived] ; - if (derived != pseudo_count_obsD_int.rbegin()->first ){ - total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsA_ext[derived]; - } - else { - total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsA_ext[derived]; - } + for (std::map::iterator it = pseudo_count_obsA_int.begin(); it != pseudo_count_obsA_int.end(); ++it) + { + int derived = it->first; + total_pseudo_count_obs_int[derived] += pseudo_count_obsA_int[derived]; + // total_pseudo_count_obs_ext[derived] += pseudo_count_obsA_ext[derived] ; + if (derived != pseudo_count_obsD_int.rbegin()->first) + { + total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsA_ext[derived]; + } + else + { + total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsA_ext[derived]; + } } - for ( std::map::iterator it = total_pseudo_count_obs_int.begin(); it != total_pseudo_count_obs_int.end(); ++it ) { - int derived = it->first ; - proba_obsD_int[derived] = pseudo_count_obsD_int[derived] / total_pseudo_count_obs_int[derived] ; - if (derived != total_pseudo_count_obs_int.rbegin()->first ) { - proba_obsD_ext["SNPs"] += pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["SNPs"] ; + for (std::map::iterator it = total_pseudo_count_obs_int.begin(); it != total_pseudo_count_obs_int.end(); ++it) + { + int derived = it->first; + proba_obsD_int[derived] = pseudo_count_obsD_int[derived] / total_pseudo_count_obs_int[derived]; + if (derived != total_pseudo_count_obs_int.rbegin()->first) + { + proba_obsD_ext["SNPs"] += pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["SNPs"]; // proba_obsD_ext["SNPs"] = proba_obsD_ext["SNPs"] * ( pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext[derived] ) ; - } - else { - proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["fixed"] ; - //proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext[derived] ; - } + } + else + { + proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["fixed"]; + // proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext[derived] ; + } } // Update the emission probabilities mModelProb->setProbFixDInternal(proba_obsD_int.rbegin()->second); mModelProb->setProbFixDExternal(proba_obsD_ext["fixed"]); - //for ( int i=1; i::iterator it = proba_obsD_int.begin(); it != proba_obsD_int.end(); ++it) + // for ( int i=1; i::iterator it = proba_obsD_int.begin(); it != proba_obsD_int.end(); ++it) { - int i = it->first ; - mModelProb->setProbSegregDInternalAt(i,proba_obsD_int[i]); + int i = it->first; + mModelProb->setProbSegregDInternalAt(i, proba_obsD_int[i]); } - //mModelProb->setProbSegregDExternal(proba_obsD_ext["SNPs"]); - + // mModelProb->setProbSegregDExternal(proba_obsD_ext["SNPs"]); } -void baumWelch::computeBaumWelch3states(hmmResults *results ,modelProb *model) +void baumWelch::computeBaumWelch3states(hmmResults *results, modelProb *model) { mHmmResults = results; mModelProb = model; @@ -95,76 +106,89 @@ void baumWelch::computeBaumWelch3states(hmmResults *results ,modelProb *model) ////////////////////// // maximum likelihood estimation of the transition and emission probabilities - map pseudo_count_obsD_int ; - map pseudo_count_obsD_ext ; - map pseudo_count_obsA_int ; - map pseudo_count_obsA_ext ; - map total_pseudo_count_obs_int ; - map total_pseudo_count_obs_ext ; - //map total_pseudo_count_obs_ext ; - - map proba_obsD_int ; - map proba_obsD_ext ; - - for ( int i = 1 ; i < mObsSequence->size()-1 ; ++i ) { - - if (mObsSequence->at(i+1).getUlindi() == 'D'){ - pseudo_count_obsD_int[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i+1] * mHmmResults->mbwd_stateI_scaled[i+1] ; - pseudo_count_obsD_ext[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateE_scaled[i+1] * mHmmResults->mbwd_stateE_scaled[i+1] + mHmmResults->mfwd_stateLE_scaled[i+1] * mHmmResults->mbwd_stateLE_scaled[i+1]; - } - else { - pseudo_count_obsA_int[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i+1] * mHmmResults->mbwd_stateI_scaled[i+1] ; - pseudo_count_obsA_ext[mObsSequence->at(i+1).getDerived()] += mHmmResults->mfwd_stateE_scaled[i+1] * mHmmResults->mbwd_stateE_scaled[i+1] + mHmmResults->mfwd_stateLE_scaled[i+1] * mHmmResults->mbwd_stateLE_scaled[i+1]; - } + map pseudo_count_obsD_int; + map pseudo_count_obsD_ext; + map pseudo_count_obsA_int; + map pseudo_count_obsA_ext; + map total_pseudo_count_obs_int; + map total_pseudo_count_obs_ext; + // map total_pseudo_count_obs_ext ; + + map proba_obsD_int; + map proba_obsD_ext; + + for (int i = 1; i < mObsSequence->size() - 1; ++i) + { + + if (mObsSequence->at(i + 1).getUlindi() == 'D') + { + pseudo_count_obsD_int[mObsSequence->at(i + 1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i + 1] * mHmmResults->mbwd_stateI_scaled[i + 1]; + pseudo_count_obsD_ext[mObsSequence->at(i + 1).getDerived()] += + mHmmResults->mfwd_stateE_scaled[i + 1] * mHmmResults->mbwd_stateE_scaled[i + 1] + mHmmResults->mfwd_stateLE_scaled[i + 1] * mHmmResults->mbwd_stateLE_scaled[i + 1]; + } + else + { + pseudo_count_obsA_int[mObsSequence->at(i + 1).getDerived()] += mHmmResults->mfwd_stateI_scaled[i + 1] * mHmmResults->mbwd_stateI_scaled[i + 1]; + pseudo_count_obsA_ext[mObsSequence->at(i + 1).getDerived()] += + mHmmResults->mfwd_stateE_scaled[i + 1] * mHmmResults->mbwd_stateE_scaled[i + 1] + mHmmResults->mfwd_stateLE_scaled[i + 1] * mHmmResults->mbwd_stateLE_scaled[i + 1]; + } } - for ( std::map::iterator it = pseudo_count_obsD_int.begin(); it != pseudo_count_obsD_int.end(); ++it ) { - int derived = it->first ; - total_pseudo_count_obs_int[derived] += pseudo_count_obsD_int[derived] ; - //total_pseudo_count_obs_ext[derived] += pseudo_count_obsD_ext[derived] ; - if (derived != pseudo_count_obsD_int.rbegin()->first ){ - total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsD_ext[derived]; - } - else { - total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsD_ext[derived]; - } + for (std::map::iterator it = pseudo_count_obsD_int.begin(); it != pseudo_count_obsD_int.end(); ++it) + { + int derived = it->first; + total_pseudo_count_obs_int[derived] += pseudo_count_obsD_int[derived]; + // total_pseudo_count_obs_ext[derived] += pseudo_count_obsD_ext[derived] ; + if (derived != pseudo_count_obsD_int.rbegin()->first) + { + total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsD_ext[derived]; + } + else + { + total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsD_ext[derived]; + } } - for ( std::map::iterator it = pseudo_count_obsA_int.begin(); it != pseudo_count_obsA_int.end(); ++it ) { - int derived = it->first ; - total_pseudo_count_obs_int[derived] += pseudo_count_obsA_int[derived] ; - //total_pseudo_count_obs_ext[derived] += pseudo_count_obsA_ext[derived] ; - if (derived != pseudo_count_obsD_int.rbegin()->first ){ - total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsA_ext[derived]; - } - else { - total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsA_ext[derived]; - } + for (std::map::iterator it = pseudo_count_obsA_int.begin(); it != pseudo_count_obsA_int.end(); ++it) + { + int derived = it->first; + total_pseudo_count_obs_int[derived] += pseudo_count_obsA_int[derived]; + // total_pseudo_count_obs_ext[derived] += pseudo_count_obsA_ext[derived] ; + if (derived != pseudo_count_obsD_int.rbegin()->first) + { + total_pseudo_count_obs_ext["SNPs"] += pseudo_count_obsA_ext[derived]; + } + else + { + total_pseudo_count_obs_ext["fixed"] += pseudo_count_obsA_ext[derived]; + } } - for ( std::map::iterator it = total_pseudo_count_obs_int.begin(); it != total_pseudo_count_obs_int.end(); ++it ) { - int derived = it->first ; - proba_obsD_int[derived] = pseudo_count_obsD_int[derived] / total_pseudo_count_obs_int[derived] ; - if (derived != total_pseudo_count_obs_int.rbegin()->first ) { - proba_obsD_ext["SNPs"] += pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["SNPs"] ; + for (std::map::iterator it = total_pseudo_count_obs_int.begin(); it != total_pseudo_count_obs_int.end(); ++it) + { + int derived = it->first; + proba_obsD_int[derived] = pseudo_count_obsD_int[derived] / total_pseudo_count_obs_int[derived]; + if (derived != total_pseudo_count_obs_int.rbegin()->first) + { + proba_obsD_ext["SNPs"] += pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["SNPs"]; // proba_obsD_ext["SNPs"] = proba_obsD_ext["SNPs"] * ( pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext[derived] ) ; - } - else { - proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["fixed"] ; - //proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext[derived] ; - } + } + else + { + proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext["fixed"]; + // proba_obsD_ext["fixed"] = pseudo_count_obsD_ext[derived] / total_pseudo_count_obs_ext[derived] ; + } } // Update the emission probabilities mModelProb->setProbFixDInternal(proba_obsD_int.rbegin()->second); mModelProb->setProbFixDExternal(proba_obsD_ext["fixed"]); - //for ( int i=1; i::iterator it = proba_obsD_int.begin(); it != proba_obsD_int.end(); ++it) + // for ( int i=1; i::iterator it = proba_obsD_int.begin(); it != proba_obsD_int.end(); ++it) { - int i = it->first ; - mModelProb->setProbSegregDInternalAt(i,proba_obsD_int[i]); + int i = it->first; + mModelProb->setProbSegregDInternalAt(i, proba_obsD_int[i]); } - //mModelProb->setProbSegregDExternal(proba_obsD_ext["SNPs"]); - + // mModelProb->setProbSegregDExternal(proba_obsD_ext["SNPs"]); } diff --git a/baumwelch.h b/baumwelch.h index 31b8c56..da71b0d 100644 --- a/baumwelch.h +++ b/baumwelch.h @@ -1,33 +1,32 @@ #ifndef BAUMWELCH_H #define BAUMWELCH_H +#include #include -#include +#include +#include #include #include -#include #include -#include -#include +#include -#include "obsData.h" -#include "modelprob.h" #include "hmmresults.h" +#include "modelprob.h" +#include "obsData.h" -using namespace std ; +using namespace std; class baumWelch { - public: - baumWelch(obsSequence *data); - void computeBaumWelch(hmmResults *results, modelProb *model ); - void computeBaumWelch3states(hmmResults *results, modelProb *model ); + public: + baumWelch(obsSequence *data); + void computeBaumWelch(hmmResults *results, modelProb *model); + void computeBaumWelch3states(hmmResults *results, modelProb *model); - protected: - obsSequence *mObsSequence; - hmmResults *mHmmResults; - modelProb *mModelProb; + protected: + obsSequence *mObsSequence; + hmmResults *mHmmResults; + modelProb *mModelProb; }; #endif // BAUMWELCH_H - diff --git a/hillclimbing.cc b/hillclimbing.cc index 54dc37e..cbd85da 100644 --- a/hillclimbing.cc +++ b/hillclimbing.cc @@ -1,35 +1,38 @@ #include "hillclimbing.h" #include -double wrapObjFunc(const std::vector &x, std::vector &grad, void* f_data) +double wrapObjFunc(const std::vector &x, std::vector &grad, void *f_data) { - hillClimbing *obj = static_cast(f_data); - return obj->likelihoodFunction(x, grad, f_data); + hillClimbing *obj = static_cast(f_data); + return obj->likelihoodFunction(x, grad, f_data); } -hillClimbing::hillClimbing(const char* sequenceFile, const char* configFile, - double lengthInternal, double lengthExternal, - double probFixDInternal, double probFixDExternal, +hillClimbing::hillClimbing(const char *sequenceFile, + const char *configFile, + double lengthInternal, + double lengthExternal, + double probFixDInternal, + double probFixDExternal, double externalUlindiDerivedProb) { // convergence_20160330_222635.log -// time_t t = time(0); // get time now -// struct tm * now = localtime( & t ); -// logFileName = "./convergence_" -// + (now->tm_year + 1900).str() -// + (now->tm_mon + 1).str() -// + now->tm_mday + '_' -// + now->tm_hour -// + now->tm_min -// + now->tm_sec -// + ".log"; + // time_t t = time(0); // get time now + // struct tm * now = localtime( & t ); + // logFileName = "./convergence_" + // + (now->tm_year + 1900).str() + // + (now->tm_mon + 1).str() + // + now->tm_mday + '_' + // + now->tm_hour + // + now->tm_min + // + now->tm_sec + // + ".log"; logFileName = "./convergence.log"; mOptTolerance = DEFAULT_OPT_TOLERANCE; - mObsSequence = new obsSequence() ; + mObsSequence = new obsSequence(); mModelProb = new modelProb(); mModelProbTemp = NULL; @@ -40,11 +43,11 @@ hillClimbing::hillClimbing(const char* sequenceFile, const char* configFile, mObsSequence->loadSequence(sequenceFile); mModelProb->loadProb(configFile); - //New mHmm class + // New mHmm class mHmm = new hmm(mObsSequence); mBaumWelch = new baumWelch(mObsSequence); - //Init mHmm members + // Init mHmm members mModelProb->setStayInternal(lengthInternal); mModelProb->setStayExternal(lengthExternal); mModelProb->setProbFixDInternal(probFixDInternal); @@ -52,16 +55,20 @@ hillClimbing::hillClimbing(const char* sequenceFile, const char* configFile, mModelProb->setProbSegregDExternal(externalUlindiDerivedProb); } -hillClimbing::hillClimbing(const char* sequenceFile, const char* configFile, - double lengthInternal, double lengthExternal, - double lengthLongExternal, double LongExternalrate, - double probFixDInternal, double probFixDExternal, - double externalUlindiDerivedProb) +hillClimbing::hillClimbing(const char *sequenceFile, + const char *configFile, + double lengthInternal, + double lengthExternal, + double lengthLongExternal, + double LongExternalrate, + double probFixDInternal, + double probFixDExternal, + double externalUlindiDerivedProb) { // convergence_20160330_222635.log - //time_t t = time(0); // get time now - //struct tm * now = localtime( & t ); - //logFileName = "./convergence_" + // time_t t = time(0); // get time now + // struct tm * now = localtime( & t ); + // logFileName = "./convergence_" // + (now->tm_year + 1900).str() // + (now->tm_mon + 1).str() // + now->tm_mday + '_' @@ -71,11 +78,10 @@ hillClimbing::hillClimbing(const char* sequenceFile, const char* configFile, // + ".log"; - logFileName = "./convergence.log"; mOptTolerance = DEFAULT_OPT_TOLERANCE; - mObsSequence = new obsSequence() ; + mObsSequence = new obsSequence(); mModelProb = new modelProb(); mModelProbTemp = NULL; @@ -86,11 +92,11 @@ hillClimbing::hillClimbing(const char* sequenceFile, const char* configFile, mObsSequence->loadSequence(sequenceFile); mModelProb->loadProb(configFile); - //New mHmm class + // New mHmm class mHmm = new hmm(mObsSequence); mBaumWelch = new baumWelch(mObsSequence); - //Init mHmm members + // Init mHmm members mModelProb->setStayInternal(lengthInternal); mModelProb->setStayExternal(lengthExternal); mModelProb->setStayLongExternal(lengthLongExternal); @@ -109,7 +115,7 @@ hillClimbing::~hillClimbing() delete mBaumWelch; } -void hillClimbing::setLogFileName(const char* name) +void hillClimbing::setLogFileName(const char *name) { logFileName = name; } @@ -140,23 +146,23 @@ void hillClimbing::randomSearch(double converThrld, int maxIteration, double win { mHmm->computeFwdBwd(mModelProb, mHmmResults); - double windowShrinkage = 1; + double windowShrinkage = 1; srand(time(NULL)); // Run the random search double stayInternal; double stayExternal; - map proba_obsD_int ; + map proba_obsD_int; stayInternal = mModelProb->getStayInternal(); stayExternal = mModelProb->getStayExternal(); - for (int n=1; n<=maxIteration; n++) + for (int n = 1; n <= maxIteration; n++) { - int step=1 ; - double baum_welch_LogLikelihood=2*mHmmResults->mlogLikelihood; - int max_iteration = 40;//Change this too + int step = 1; + double baum_welch_LogLikelihood = 2 * mHmmResults->mlogLikelihood; + int max_iteration = 40; // Change this too mModelProbTemp = new modelProb(); mHmmResultsTemp = new hmmResults(); @@ -164,36 +170,36 @@ void hillClimbing::randomSearch(double converThrld, int maxIteration, double win mModelProbTemp->resizeProbSegregDInternal(mModelProb->getSizeProbSegregDInternal()); // Propose new values for the parameters of the transition probabilities - mModelProbTemp->setStayInternal(mModelProb->getStayInternal() + (rand() % (int)(windowInt/windowShrinkage)) - (int)((windowInt/windowShrinkage)/2)); - mModelProbTemp->setStayExternal(mModelProb->getStayExternal() + (rand() % (int)(windowExt/windowShrinkage)) - (int)((windowExt/windowShrinkage)/2)); + mModelProbTemp->setStayInternal(mModelProb->getStayInternal() + (rand() % (int)(windowInt / windowShrinkage)) - (int)((windowInt / windowShrinkage) / 2)); + mModelProbTemp->setStayExternal(mModelProb->getStayExternal() + (rand() % (int)(windowExt / windowShrinkage)) - (int)((windowExt / windowShrinkage) / 2)); if (mModelProbTemp->getStayInternal() < 0.0) - mModelProbTemp->setStayInternal(-mModelProbTemp->getStayInternal()); - else if(mModelProbTemp->getStayInternal() == 0.0) - cerr << "stayInt = 0" << endl; + mModelProbTemp->setStayInternal(-mModelProbTemp->getStayInternal()); + else if (mModelProbTemp->getStayInternal() == 0.0) + cerr << "stayInt = 0" << endl; if (mModelProbTemp->getStayExternal() < 0.0) - mModelProbTemp->setStayExternal(-mModelProbTemp->getStayExternal()); - else if(mModelProbTemp->getStayExternal() == 0.0) - cerr << "stayExt = 0" << endl; + mModelProbTemp->setStayExternal(-mModelProbTemp->getStayExternal()); + else if (mModelProbTemp->getStayExternal() == 0.0) + cerr << "stayExt = 0" << endl; // We keep this parameter throughout the algorithm because somehow we cannot re-estimate it mModelProbTemp->setProbSegregDExternal(mModelProb->getProbSegregDExternal()); - // Run the Baum-Welch algorithm with the new values + // Run the Baum-Welch algorithm with the new values mHmmResultsTemp->mlogLikelihood = mHmmResults->mlogLikelihood; // La boucle while correspond a une methode convergenceBaumWelch autonome // qui serait appellee par les differents algo qui proposent des valeurs stayInternal et External (convergence des transitions) - while((stepmlogLikelihood - baum_welch_LogLikelihood)>converThrld)) + while ((step < max_iteration) && ((mHmmResultsTemp->mlogLikelihood - baum_welch_LogLikelihood) > converThrld)) { - //ostringstream oss; - //oss << step; - //string logInfo = oss.str(); - //mHmm.writeLogFile("./convergence.log",logInfo); + // ostringstream oss; + // oss << step; + // string logInfo = oss.str(); + // mHmm.writeLogFile("./convergence.log",logInfo); step++; - //keep previous likelihood to detect convergence + // keep previous likelihood to detect convergence baum_welch_LogLikelihood = mHmmResultsTemp->mlogLikelihood; mBaumWelch->computeBaumWelch(mHmmResults, mModelProbTemp); mHmm->computeFwdBwd(mModelProbTemp, mHmmResultsTemp); @@ -202,27 +208,27 @@ void hillClimbing::randomSearch(double converThrld, int maxIteration, double win // Accept or reject the new values. if (mHmmResults->mlogLikelihood < mHmmResultsTemp->mlogLikelihood) { - //Keep new model + // Keep new model delete mModelProb; mModelProb = mModelProbTemp; - //Update Results + // Update Results delete mHmmResults; mHmmResults = mHmmResultsTemp; - if (windowShrinkage>1) - windowShrinkage--; + if (windowShrinkage > 1) + windowShrinkage--; ostringstream oss; oss << step; string logInfo = oss.str(); - mHmm->writeLogFile(logFileName,logInfo); + mHmm->writeLogFile(logFileName, logInfo); } else { - //Reject new model + // Reject new model delete mModelProbTemp; - //Reject results + // Reject results delete mHmmResultsTemp; mModelProbTemp = NULL; mHmmResultsTemp = NULL; @@ -230,21 +236,21 @@ void hillClimbing::randomSearch(double converThrld, int maxIteration, double win } // Convergence? - if (windowShrinkage==maxWindowShrinkage) + if (windowShrinkage == maxWindowShrinkage) break; } - //write output file with last parameters - mHmm->computeFwdBwd(mModelProb, mHmmResults);//TODO methode pour set HmmResults afin de changer le pointeur pour ecrire OutputFile + // write output file with last parameters + mHmm->computeFwdBwd(mModelProb, mHmmResults); // TODO methode pour set HmmResults afin de changer le pointeur pour ecrire OutputFile mHmm->writeOutputFile(); - } -void hillClimbing::randomSearch3states(double converThrld, int maxIteration, double windowInt, double windowExt, double windowLongExt, double windowLErate, double maxWindowShrinkage) +void hillClimbing::randomSearch3states( + double converThrld, int maxIteration, double windowInt, double windowExt, double windowLongExt, double windowLErate, double maxWindowShrinkage) { - //cout << "Here0" << flush; + // cout << "Here0" << flush; mHmm->computeFwdBwd3states(mModelProb, mHmmResults); - double windowShrinkage = 1; + double windowShrinkage = 1; srand(time(NULL)); // Run the random search @@ -253,19 +259,19 @@ void hillClimbing::randomSearch3states(double converThrld, int maxIteration, dou double stayExternal; double stayLongExternal; double LErate; - map proba_obsD_int ; + map proba_obsD_int; stayInternal = mModelProb->getStayInternal(); stayExternal = mModelProb->getStayExternal(); stayLongExternal = mModelProb->getStayLongExternal(); LErate = mModelProb->getLErate(); - for (int n=1; n<=maxIteration; n++) + for (int n = 1; n <= maxIteration; n++) { - //cout << "Here" << flush; - int step=1 ; - double baum_welch_LogLikelihood=2*mHmmResults->mlogLikelihood; - int max_iteration = 40;//Change this too + // cout << "Here" << flush; + int step = 1; + double baum_welch_LogLikelihood = 2 * mHmmResults->mlogLikelihood; + int max_iteration = 40; // Change this too mModelProbTemp = new modelProb(); mHmmResultsTemp = new hmmResults(); @@ -273,33 +279,34 @@ void hillClimbing::randomSearch3states(double converThrld, int maxIteration, dou mModelProbTemp->resizeProbSegregDInternal(mModelProb->getSizeProbSegregDInternal()); // Propose new values for the parameters of the transition probabilities - mModelProbTemp->setStayInternal(mModelProb->getStayInternal() + (rand() % (int)(windowInt/windowShrinkage)) - (int)((windowInt/windowShrinkage)/2)); - mModelProbTemp->setStayExternal(mModelProb->getStayExternal() + (rand() % (int)(windowExt/windowShrinkage)) - (int)((windowExt/windowShrinkage)/2)); - mModelProbTemp->setStayLongExternal(mModelProb->getStayLongExternal() + (rand() % (int)(windowLongExt/windowShrinkage)) - (int)((windowLongExt/windowShrinkage)/2)); - mModelProbTemp->setLErate(mModelProb->getLErate() + (((double)rand() / RAND_MAX) * (double)(windowLErate/windowShrinkage)) - (double)((windowLErate/windowShrinkage)/2)); + mModelProbTemp->setStayInternal(mModelProb->getStayInternal() + (rand() % (int)(windowInt / windowShrinkage)) - (int)((windowInt / windowShrinkage) / 2)); + mModelProbTemp->setStayExternal(mModelProb->getStayExternal() + (rand() % (int)(windowExt / windowShrinkage)) - (int)((windowExt / windowShrinkage) / 2)); + mModelProbTemp->setStayLongExternal(mModelProb->getStayLongExternal() + (rand() % (int)(windowLongExt / windowShrinkage)) - (int)((windowLongExt / windowShrinkage) / 2)); + mModelProbTemp->setLErate(mModelProb->getLErate() + (((double)rand() / RAND_MAX) * (double)(windowLErate / windowShrinkage)) + - (double)((windowLErate / windowShrinkage) / 2)); if (mModelProbTemp->getStayInternal() < 0.0) - mModelProbTemp->setStayInternal(-mModelProbTemp->getStayInternal()); - else if(mModelProbTemp->getStayInternal() == 0.0) - cerr << "stayInt = 0" << endl; + mModelProbTemp->setStayInternal(-mModelProbTemp->getStayInternal()); + else if (mModelProbTemp->getStayInternal() == 0.0) + cerr << "stayInt = 0" << endl; if (mModelProbTemp->getStayExternal() < 0.0) - mModelProbTemp->setStayExternal(-mModelProbTemp->getStayExternal()); - else if(mModelProbTemp->getStayExternal() == 0.0) - cerr << "stayExt = 0" << endl; + mModelProbTemp->setStayExternal(-mModelProbTemp->getStayExternal()); + else if (mModelProbTemp->getStayExternal() == 0.0) + cerr << "stayExt = 0" << endl; if (mModelProbTemp->getStayLongExternal() < mModelProbTemp->getStayExternal()) - mModelProbTemp->setStayLongExternal(2*mModelProbTemp->getStayExternal() - mModelProbTemp->getStayLongExternal()); - else if(mModelProbTemp->getStayLongExternal() == 0.0) - cerr << "stayLongExt = 0" << endl; + mModelProbTemp->setStayLongExternal(2 * mModelProbTemp->getStayExternal() - mModelProbTemp->getStayLongExternal()); + else if (mModelProbTemp->getStayLongExternal() == 0.0) + cerr << "stayLongExt = 0" << endl; if (mModelProbTemp->getLErate() < 0.0) mModelProbTemp->setLErate(-mModelProbTemp->getLErate()); else if (mModelProbTemp->getLErate() > 1.0) - mModelProbTemp->setLErate(2*mModelProbTemp->getLErate() - 1); - + mModelProbTemp->setLErate(2 * mModelProbTemp->getLErate() - 1); - // We keep this parameter throughout the algorithm because somehow we cannot re-estimate it + + // We keep this parameter throughout the algorithm because somehow we cannot re-estimate it mModelProbTemp->setProbSegregDExternal(mModelProb->getProbSegregDExternal()); // Run the Baum-Welch algorithm with the new values @@ -308,43 +315,43 @@ void hillClimbing::randomSearch3states(double converThrld, int maxIteration, dou // La boucle while correspond a une methode convergenceBaumWelch autonome // qui serait appellee par les differents algo qui proposent des valeurs stayInternal et External (convergence des transitions) - while((stepmlogLikelihood - baum_welch_LogLikelihood)>converThrld)) + while ((step < max_iteration) && ((mHmmResultsTemp->mlogLikelihood - baum_welch_LogLikelihood) > converThrld)) { - //ostringstream oss; - //oss << step; - //string logInfo = oss.str(); - //mHmm.writeLogFile("./convergence.log",logInfo); + // ostringstream oss; + // oss << step; + // string logInfo = oss.str(); + // mHmm.writeLogFile("./convergence.log",logInfo); step++; - //keep previous likelihood to detect convergence + // keep previous likelihood to detect convergence baum_welch_LogLikelihood = mHmmResultsTemp->mlogLikelihood; mBaumWelch->computeBaumWelch3states(mHmmResults, mModelProbTemp); mHmm->computeFwdBwd3states(mModelProbTemp, mHmmResultsTemp); } - + // Accept or reject the new values. if (mHmmResults->mlogLikelihood < mHmmResultsTemp->mlogLikelihood) { - //Keep new model + // Keep new model delete mModelProb; mModelProb = mModelProbTemp; - //Update Results + // Update Results delete mHmmResults; mHmmResults = mHmmResultsTemp; - if (windowShrinkage>1) - windowShrinkage--; + if (windowShrinkage > 1) + windowShrinkage--; ostringstream oss; oss << step; string logInfo = oss.str(); - mHmm->writeLogFile3states(logFileName,logInfo); + mHmm->writeLogFile3states(logFileName, logInfo); } else { - //Reject new model + // Reject new model delete mModelProbTemp; - //Reject results + // Reject results delete mHmmResultsTemp; mModelProbTemp = NULL; mHmmResultsTemp = NULL; @@ -352,45 +359,41 @@ void hillClimbing::randomSearch3states(double converThrld, int maxIteration, dou } // Convergence? - if (windowShrinkage==maxWindowShrinkage) + if (windowShrinkage == maxWindowShrinkage) break; } - //write output file with last parameters - mHmm->computeFwdBwd3states(mModelProb, mHmmResults);//TODO methode pour set HmmResults afin de changer le pointeur pour ecrire OutputFile + // write output file with last parameters + mHmm->computeFwdBwd3states(mModelProb, mHmmResults); // TODO methode pour set HmmResults afin de changer le pointeur pour ecrire OutputFile mHmm->writeOutputFile3states(); - } - - - -//optimisation d'un modele +// optimisation d'un modele void hillClimbing::optimisationBaumWelch(modelProb *model, hmmResults *results, int max_iteration) { int step = 1; double baum_welch_LogLikelihood; - //Start from base model + // Start from base model baum_welch_LogLikelihood = mHmmResults->mlogLikelihood; mBaumWelch->computeBaumWelch3states(mHmmResults, model); mHmm->computeFwdBwd3states(model, results); - while((stepmlogLikelihood - baum_welch_LogLikelihood)>mConverThrld) + while ((step < max_iteration) && (results->mlogLikelihood - baum_welch_LogLikelihood) > mConverThrld) { step++; - //keep previous likelihood to detect convergence + // keep previous likelihood to detect convergence baum_welch_LogLikelihood = results->mlogLikelihood; mBaumWelch->computeBaumWelch3states(results, model); mHmm->computeFwdBwd3states(model, results); } } -//Function pour nlopt +// Function pour nlopt -//double f(const std::vector &x, std::vector &grad, void* f_data); +// double f(const std::vector &x, std::vector &grad, void* f_data); -double hillClimbing::likelihoodFunction(const std::vector &x, std::vector &grad, void* f_data) +double hillClimbing::likelihoodFunction(const std::vector &x, std::vector &grad, void *f_data) { double likelihood; @@ -416,15 +419,15 @@ double hillClimbing::likelihoodFunction(const std::vector &x, std::vecto likelihood = mHmmResultsTemp->mlogLikelihood; - //Write log file + // Write log file ostringstream oss; - //oss << step; + // oss << step; string logInfo = oss.str(); - mHmm->writeLogFile3states(logFileName,logInfo); + mHmm->writeLogFile3states(logFileName, logInfo); - //Reject new model + // Reject new model delete mModelProbTemp; - //Reject results + // Reject results delete mHmmResultsTemp; mModelProbTemp = NULL; mHmmResultsTemp = NULL; @@ -432,83 +435,81 @@ double hillClimbing::likelihoodFunction(const std::vector &x, std::vecto return likelihood; } -double constraint(const std::vector &x, std::vector &grad, void* f_data) +double constraint(const std::vector &x, std::vector &grad, void *f_data) { - return x.at(IDX_STAY_EXTERNAL) - x.at(IDX_STAY_LONG_INTERNAL); //stayExternal - stayLongExternal <= 0 + return x.at(IDX_STAY_EXTERNAL) - x.at(IDX_STAY_LONG_INTERNAL); // stayExternal - stayLongExternal <= 0 } -//Algo de recherche avec nlopt +// Algo de recherche avec nlopt double hillClimbing::nelderMeadSimplex(double converThrld) { mConverThrld = converThrld; mHmm->computeFwdBwd3states(mModelProb, mHmmResults); - //Optimisation du modèle de base + // Optimisation du modèle de base optimisationBaumWelch(mModelProb, mHmmResults, 40); - //A partir du modèle précédent on utilise nlopt pour rechercher le maximum en fonctions des 4 paramètres : - //StayInternal - //StayExternal - //StayLongExternal - //LErate + // A partir du modèle précédent on utilise nlopt pour rechercher le maximum en fonctions des 4 paramètres : + // StayInternal + // StayExternal + // StayLongExternal + // LErate - //Init nlopt -// mOpt = new nlopt::opt(nlopt::LN_NELDERMEAD, 4); + // Init nlopt + // mOpt = new nlopt::opt(nlopt::LN_NELDERMEAD, 4); mOpt = new nlopt::opt(nlopt::LN_COBYLA, 4); - //Borner limites nlopt + // Borner limites nlopt mOpt->set_lower_bounds(0.0); - double mybounds[] = {HUGE_VAL, HUGE_VAL, HUGE_VAL, 1.0}; //HUGE_VAL inclus dans math.h - vector values(mybounds, mybounds + sizeof(mybounds) / sizeof(double) ); + double mybounds[] = {HUGE_VAL, HUGE_VAL, HUGE_VAL, 1.0}; // HUGE_VAL inclus dans math.h + vector values(mybounds, mybounds + sizeof(mybounds) / sizeof(double)); mOpt->set_upper_bounds(values); - //mOpt->set_upper_bounds(vector({HUGE_VAL, HUGE_VAL, HUGE_VAL, 1.0}, 4)); //HUGE_VAL math.h + // mOpt->set_upper_bounds(vector({HUGE_VAL, HUGE_VAL, HUGE_VAL, 1.0}, 4)); //HUGE_VAL math.h - //Contrainte stayLongExtrnal > stayExternal + // Contrainte stayLongExtrnal > stayExternal mOpt->add_inequality_constraint(constraint, NULL, 0.0); - //Criteria + // Criteria mOpt->set_ftol_abs(mOptTolerance); - mOpt->set_maxeval(1000);//Max iteration + mOpt->set_maxeval(1000); // Max iteration mOpt->set_max_objective(wrapObjFunc, this); - //String de description de l'algorithme obtenue avec - //const char *nlopt::opt::get_algorithm_name() const; - + // String de description de l'algorithme obtenue avec + // const char *nlopt::opt::get_algorithm_name() const; - //Initialise parametres de recherche - vector x(4,0.0); + // Initialise parametres de recherche + vector x(4, 0.0); double opt_f; x.at(IDX_STAY_INTERNAL) = mModelProb->getStayInternal(); x.at(IDX_STAY_EXTERNAL) = mModelProb->getStayExternal(); x.at(IDX_STAY_LONG_INTERNAL) = mModelProb->getStayLongExternal(); x.at(IDX_LERATE) = mModelProb->getLErate(); - //appel nlopt pour faire la recherche - char result; //1 octet : va de -128 jusqu'à +127 + // appel nlopt pour faire la recherche + char result; // 1 octet : va de -128 jusqu'à +127 result = mOpt->optimize(x, opt_f); - if(result > 0) + if (result > 0) { - //Opt ok - //write output file with best parameters + // Opt ok + // write output file with best parameters - mModelProb->setStayInternal(x.at(IDX_STAY_INTERNAL)); + mModelProb->setStayInternal(x.at(IDX_STAY_INTERNAL)); mModelProb->setStayExternal(x.at(IDX_STAY_EXTERNAL)); mModelProb->setStayLongExternal(x.at(IDX_STAY_LONG_INTERNAL)); mModelProb->setLErate(x.at(IDX_LERATE)); optimisationBaumWelch(mModelProb, mHmmResults, 40); - //likelihoodFunction(x, grad, NULL); + // likelihoodFunction(x, grad, NULL); mHmm->writeOutputFile3states(); - } else { - cout << "Nlopt optimisation failed: " << result << "\t";//erreur + cout << "Nlopt optimisation failed: " << result << "\t"; // erreur } delete mOpt; } diff --git a/hillclimbing.h b/hillclimbing.h index 698c8b7..bea3842 100644 --- a/hillclimbing.h +++ b/hillclimbing.h @@ -1,11 +1,11 @@ #ifndef HILLCLIMBING_H #define HILLCLIMBING_H -#include "hmm.h" #include "baumwelch.h" +#include "hmm.h" #include -//Random search default values +// Random search default values #define MAX_ITERATIONS 10000 #define WINDOW_INT 4000 #define WINDOW_EXT 2000 @@ -15,7 +15,8 @@ #define DEFAULT_OPT_TOLERANCE 0.0001 -enum{ +enum +{ IDX_STAY_INTERNAL = 0, IDX_STAY_EXTERNAL, IDX_STAY_LONG_INTERNAL, @@ -24,7 +25,7 @@ enum{ class hillClimbing { -private: + private: nlopt::opt *mOpt; obsSequence *mObsSequence; @@ -39,48 +40,52 @@ class hillClimbing baumWelch *mBaumWelch; double mConverThrld; - double mOptTolerance; //Tolerance de fin d'optimisation - const char* logFileName; + double mOptTolerance; // Tolerance de fin d'optimisation + const char *logFileName; void optimisationBaumWelch(modelProb *model, hmmResults *results, int max_iteration); -public: - //hillClimbing(); - hillClimbing(const char* sequenceFile, const char* configFile, - double lengthInternal, double lengthExternal, - double probFixDInternal, double probFixDExternal, + public: + // hillClimbing(); + hillClimbing(const char *sequenceFile, + const char *configFile, + double lengthInternal, + double lengthExternal, + double probFixDInternal, + double probFixDExternal, + double externalUlindiDerivedProb); + hillClimbing(const char *sequenceFile, + const char *configFile, + double lengthInternal, + double lengthExternal, + double lengthLongExternal, + double LongExternalrate, + double probFixDInternal, + double probFixDExternal, double externalUlindiDerivedProb); - hillClimbing(const char* sequenceFile, const char* configFile, - double lengthInternal, double lengthExternal, - double lengthLongExternal, double LongExternalrate, - double probFixDInternal, double probFixDExternal, - double externalUlindiDerivedProb); ~hillClimbing(); - void setLogFileName(const char*); + void setLogFileName(const char *); void hmmOnly(); void hmmOnly3states(); - //void randomSearch(int maxIteration); - //void randomSearch(int maxIteration, double windowInt, double windowExt); - void randomSearch( double converThrld, int maxIteration = MAX_ITERATIONS, - double windowInt = WINDOW_INT, - double windowExt = WINDOW_EXT, - double maxWindowShrinkage = MAX_WINDOW_SHRINKAGE); + // void randomSearch(int maxIteration); + // void randomSearch(int maxIteration, double windowInt, double windowExt); + void randomSearch( + double converThrld, int maxIteration = MAX_ITERATIONS, double windowInt = WINDOW_INT, double windowExt = WINDOW_EXT, double maxWindowShrinkage = MAX_WINDOW_SHRINKAGE); - void randomSearch3states( double converThrld, int maxIteration = MAX_ITERATIONS, - double windowInt = WINDOW_INT, - double windowExt = WINDOW_EXT, - double windowLongExt = WINDOW_LONGEXT, - double windowLErate = WINDOW_LERATE, - double maxWindowShrinkage = MAX_WINDOW_SHRINKAGE); + void randomSearch3states(double converThrld, + int maxIteration = MAX_ITERATIONS, + double windowInt = WINDOW_INT, + double windowExt = WINDOW_EXT, + double windowLongExt = WINDOW_LONGEXT, + double windowLErate = WINDOW_LERATE, + double maxWindowShrinkage = MAX_WINDOW_SHRINKAGE); double nelderMeadSimplex(double converThrld); - double likelihoodFunction(const std::vector &x, std::vector &grad, void* f_data); - + double likelihoodFunction(const std::vector &x, std::vector &grad, void *f_data); }; #endif // HILLCLIMBING_H - diff --git a/hmm.cc b/hmm.cc index 4b4edde..bf0dbc1 100644 --- a/hmm.cc +++ b/hmm.cc @@ -1,35 +1,37 @@ #include "hmm.h" #include -hmm::hmm( obsSequence *seq) +hmm::hmm(obsSequence *seq) { - mObsSequence = seq; - mModelProb = NULL; - mHmmResults = NULL; + mObsSequence = seq; + mModelProb = NULL; + mHmmResults = NULL; } -double hmm::prob_obs_internal( obsSite dat, probSegregDInternal prob ) +double hmm::prob_obs_internal(obsSite dat, probSegregDInternal prob) { - if ( dat.getDerived() < dat.getCoverage() ) - if ( dat.getUlindi() == 'D' ) - return prob.getProb( dat.getDerived() ) ; - else return 1.-prob.getProb( dat.getDerived() ) ; - else - if ( dat.getUlindi() == 'D' ) - return mModelProb->getProbFixDInternal() ; - else return 1.-mModelProb->getProbFixDInternal() ; + if (dat.getDerived() < dat.getCoverage()) + if (dat.getUlindi() == 'D') + return prob.getProb(dat.getDerived()); + else + return 1. - prob.getProb(dat.getDerived()); + else if (dat.getUlindi() == 'D') + return mModelProb->getProbFixDInternal(); + else + return 1. - mModelProb->getProbFixDInternal(); } -double hmm::prob_obs_external( obsSite dat ) +double hmm::prob_obs_external(obsSite dat) { - if ( dat.getDerived() < dat.getCoverage() ) - if ( dat.getUlindi() == 'D' ) - return mModelProb->getProbSegregDExternal() ; - else return 1.-mModelProb->getProbSegregDExternal() ; - else - if ( dat.getUlindi() == 'D' ) - return mModelProb->getProbFixDExternal() ; - else return 1.-mModelProb->getProbFixDExternal() ; + if (dat.getDerived() < dat.getCoverage()) + if (dat.getUlindi() == 'D') + return mModelProb->getProbSegregDExternal(); + else + return 1. - mModelProb->getProbSegregDExternal(); + else if (dat.getUlindi() == 'D') + return mModelProb->getProbFixDExternal(); + else + return 1. - mModelProb->getProbFixDExternal(); } void hmm::computeFwdBwd(modelProb *model, hmmResults *result) @@ -38,69 +40,75 @@ void hmm::computeFwdBwd(modelProb *model, hmmResults *result) mModelProb = model; mHmmResults = result; - ////////// - // forward probabilities - mHmmResults->mfwd_stateE_scaled.resize( mObsSequence->size(), 0.0 ) ; // external - mHmmResults->mfwd_stateI_scaled.resize( mObsSequence->size(), 0.0 ) ; // internal - // first one by hand - vector fwd_scaling( mObsSequence->size(), 0.0 ) ; - // Initial state = internal - fwd_scaling[0] = prob_obs_internal( mObsSequence->at(0), mModelProb->getProbSegregDInternal() ) ; - mHmmResults->mfwd_stateI_scaled[0] = 1. ; // == fwd_stateI[0]/fwd_scaling[0] ; - // rest - for ( int i = 1 ; i < mObsSequence->size() ; ++i ) { - - double I_tmp = prob_obs_internal( mObsSequence->at(i), mModelProb->getProbSegregDInternal() ) - *mHmmResults->mfwd_stateI_scaled[i-1]*1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayInternal()) + - prob_obs_internal( mObsSequence->at(i), mModelProb->getProbSegregDInternal() ) - *mHmmResults->mfwd_stateE_scaled[i-1]*(1.-1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayExternal())) ; - double E_tmp = ( prob_obs_external( mObsSequence->at(i) ) - * mHmmResults->mfwd_stateI_scaled[i-1]*(1.-1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayInternal())) + - prob_obs_external( mObsSequence->at(i) ) - * mHmmResults->mfwd_stateE_scaled[i-1]*1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayExternal()) ) ; - - fwd_scaling[i] = I_tmp + E_tmp ; - - mHmmResults->mfwd_stateI_scaled[i] = I_tmp/fwd_scaling[i] ; - mHmmResults->mfwd_stateE_scaled[i] = E_tmp/fwd_scaling[i] ; - } - - ////////////// - // Log-likelihood - mHmmResults->mlogLikelihood = 0.0; - for ( int i = 1 ; i < mObsSequence->size() ; ++i ) { - mHmmResults->mlogLikelihood += log(fwd_scaling[i]); - } - - ////////////////// - // backward probabilities - mHmmResults->mbwd_stateI_scaled.resize( mObsSequence->size(), 0.0 ) ; - mHmmResults->mbwd_stateE_scaled.resize( mObsSequence->size(), 0.0 ) ; - mHmmResults->mbwd_stateI_scaled[mObsSequence->size()-1] = 1. ; - mHmmResults->mbwd_stateE_scaled[mObsSequence->size()-1] = 1. ; - for ( int i = mObsSequence->size()-2 ; i >= 0 ; i-- ) { - - double scale = fwd_scaling[i+1] ; - mHmmResults->mbwd_stateI_scaled[i] = (exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayInternal()) - * prob_obs_internal( mObsSequence->at(i+1), mModelProb->getProbSegregDInternal() )*mHmmResults->mbwd_stateI_scaled[i+1] + - (1.-exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayInternal())) - * prob_obs_external( mObsSequence->at(i+1) )*mHmmResults->mbwd_stateE_scaled[i+1])/scale ; - - mHmmResults->mbwd_stateE_scaled[i] = ((1.-exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayExternal())) - * prob_obs_internal( mObsSequence->at(i+1), mModelProb->getProbSegregDInternal() )*mHmmResults->mbwd_stateI_scaled[i+1] + - exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayExternal()) * prob_obs_external( mObsSequence->at(i+1) ) - *mHmmResults->mbwd_stateE_scaled[i+1])/scale ; - - } - - ////////////////////// - // posterior decoding - mHmmResults->mExternal.resize( mObsSequence->size(), 0 ) ; // == 1 if external - for ( size_t i = 0 ; i < mObsSequence->size() ; ++i ) { - double val = (mHmmResults->mfwd_stateI_scaled[i]*mHmmResults->mbwd_stateI_scaled[i]) ; - if ( val < 0.2 ) mHmmResults->mExternal[i] = 1 ; - else mHmmResults->mExternal[i] = 0 ; - } + ////////// + // forward probabilities + mHmmResults->mfwd_stateE_scaled.resize(mObsSequence->size(), 0.0); // external + mHmmResults->mfwd_stateI_scaled.resize(mObsSequence->size(), 0.0); // internal + // first one by hand + vector fwd_scaling(mObsSequence->size(), 0.0); + // Initial state = internal + fwd_scaling[0] = prob_obs_internal(mObsSequence->at(0), mModelProb->getProbSegregDInternal()); + mHmmResults->mfwd_stateI_scaled[0] = 1.; // == fwd_stateI[0]/fwd_scaling[0] ; + // rest + for (int i = 1; i < mObsSequence->size(); ++i) + { + + double I_tmp = prob_obs_internal(mObsSequence->at(i), mModelProb->getProbSegregDInternal()) * mHmmResults->mfwd_stateI_scaled[i - 1] * 1. + / exp(mObsSequence->at(i).getDist() / mModelProb->getStayInternal()) + + prob_obs_internal(mObsSequence->at(i), mModelProb->getProbSegregDInternal()) * mHmmResults->mfwd_stateE_scaled[i - 1] + * (1. - 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayExternal())); + double E_tmp = + (prob_obs_external(mObsSequence->at(i)) * mHmmResults->mfwd_stateI_scaled[i - 1] * (1. - 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayInternal())) + + prob_obs_external(mObsSequence->at(i)) * mHmmResults->mfwd_stateE_scaled[i - 1] * 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayExternal())); + + fwd_scaling[i] = I_tmp + E_tmp; + + mHmmResults->mfwd_stateI_scaled[i] = I_tmp / fwd_scaling[i]; + mHmmResults->mfwd_stateE_scaled[i] = E_tmp / fwd_scaling[i]; + } + + ////////////// + // Log-likelihood + mHmmResults->mlogLikelihood = 0.0; + for (int i = 1; i < mObsSequence->size(); ++i) + { + mHmmResults->mlogLikelihood += log(fwd_scaling[i]); + } + + ////////////////// + // backward probabilities + mHmmResults->mbwd_stateI_scaled.resize(mObsSequence->size(), 0.0); + mHmmResults->mbwd_stateE_scaled.resize(mObsSequence->size(), 0.0); + mHmmResults->mbwd_stateI_scaled[mObsSequence->size() - 1] = 1.; + mHmmResults->mbwd_stateE_scaled[mObsSequence->size() - 1] = 1.; + for (int i = mObsSequence->size() - 2; i >= 0; i--) + { + + double scale = fwd_scaling[i + 1]; + mHmmResults->mbwd_stateI_scaled[i] = (exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayInternal()) + * prob_obs_internal(mObsSequence->at(i + 1), mModelProb->getProbSegregDInternal()) * mHmmResults->mbwd_stateI_scaled[i + 1] + + (1. - exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayInternal())) * prob_obs_external(mObsSequence->at(i + 1)) + * mHmmResults->mbwd_stateE_scaled[i + 1]) + / scale; + + mHmmResults->mbwd_stateE_scaled[i] = + ((1. - exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayExternal())) * prob_obs_internal(mObsSequence->at(i + 1), mModelProb->getProbSegregDInternal()) + * mHmmResults->mbwd_stateI_scaled[i + 1] + + exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayExternal()) * prob_obs_external(mObsSequence->at(i + 1)) * mHmmResults->mbwd_stateE_scaled[i + 1]) + / scale; + } + + ////////////////////// + // posterior decoding + mHmmResults->mExternal.resize(mObsSequence->size(), 0); // == 1 if external + for (size_t i = 0; i < mObsSequence->size(); ++i) + { + double val = (mHmmResults->mfwd_stateI_scaled[i] * mHmmResults->mbwd_stateI_scaled[i]); + if (val < 0.2) + mHmmResults->mExternal[i] = 1; + else + mHmmResults->mExternal[i] = 0; + } } void hmm::computeFwdBwd3states(modelProb *model, hmmResults *result) @@ -109,153 +117,138 @@ void hmm::computeFwdBwd3states(modelProb *model, hmmResults *result) mModelProb = model; mHmmResults = result; - ////////// - // forward probabilities - mHmmResults->mfwd_stateE_scaled.resize( mObsSequence->size(), 0.0 ) ; // external - mHmmResults->mfwd_stateLE_scaled.resize( mObsSequence->size(), 0.0 ) ; // long external - mHmmResults->mfwd_stateI_scaled.resize( mObsSequence->size(), 0.0 ) ; // internal - // first one by hand - vector fwd_scaling( mObsSequence->size(), 0.0 ) ; - // Initial state = internal - fwd_scaling[0] = prob_obs_internal( mObsSequence->at(0), mModelProb->getProbSegregDInternal() ) ; - mHmmResults->mfwd_stateI_scaled[0] = 1. ; // == fwd_stateI[0]/fwd_scaling[0] ; - // rest - for ( int i = 1 ; i < mObsSequence->size() ; ++i ) { - - double I_tmp = prob_obs_internal( mObsSequence->at(i), mModelProb->getProbSegregDInternal() ) - *mHmmResults->mfwd_stateI_scaled[i-1]*1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayInternal()) + - prob_obs_internal( mObsSequence->at(i), mModelProb->getProbSegregDInternal() ) - *mHmmResults->mfwd_stateE_scaled[i-1]*(1.-1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayExternal())) + - prob_obs_internal( mObsSequence->at(i), mModelProb->getProbSegregDInternal() ) - *mHmmResults->mfwd_stateLE_scaled[i-1]*(1.-1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayLongExternal())) ; - - double E_tmp = ( prob_obs_external( mObsSequence->at(i) ) - * mHmmResults->mfwd_stateI_scaled[i-1]*(1-mModelProb->getLErate())*(1.-1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayInternal())) + - prob_obs_external( mObsSequence->at(i) ) - * mHmmResults->mfwd_stateE_scaled[i-1]*1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayExternal()) ) ; - - double LE_tmp = ( prob_obs_external( mObsSequence->at(i) ) - * mHmmResults->mfwd_stateI_scaled[i-1]*mModelProb->getLErate()*(1.-1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayInternal())) + - prob_obs_external( mObsSequence->at(i) ) - * mHmmResults->mfwd_stateLE_scaled[i-1]*1./exp(mObsSequence->at(i).getDist()/mModelProb->getStayLongExternal()) ) ; - - fwd_scaling[i] = I_tmp + E_tmp + LE_tmp ; - - mHmmResults->mfwd_stateI_scaled[i] = I_tmp/fwd_scaling[i] ; - mHmmResults->mfwd_stateE_scaled[i] = E_tmp/fwd_scaling[i] ; - mHmmResults->mfwd_stateLE_scaled[i] = LE_tmp/fwd_scaling[i] ; - } - - ////////////// - // Log-likelihood - mHmmResults->mlogLikelihood = 0.0; - for ( int i = 1 ; i < mObsSequence->size() ; ++i ) { - mHmmResults->mlogLikelihood += log(fwd_scaling[i]); - } - ////////////////// - // backward probabilities - mHmmResults->mbwd_stateI_scaled.resize( mObsSequence->size(), 0.0 ) ; - mHmmResults->mbwd_stateE_scaled.resize( mObsSequence->size(), 0.0 ) ; - mHmmResults->mbwd_stateLE_scaled.resize( mObsSequence->size(), 0.0 ) ; - mHmmResults->mbwd_stateI_scaled[mObsSequence->size()-1] = 1. ; - mHmmResults->mbwd_stateE_scaled[mObsSequence->size()-1] = 1. ; - mHmmResults->mbwd_stateLE_scaled[mObsSequence->size()-1] = 1. ; - for ( int i = mObsSequence->size()-2 ; i >= 0 ; i-- ) { - - double scale = fwd_scaling[i+1] ; - mHmmResults->mbwd_stateI_scaled[i] = (exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayInternal()) - * prob_obs_internal( mObsSequence->at(i+1), mModelProb->getProbSegregDInternal() )*mHmmResults->mbwd_stateI_scaled[i+1] + - (1-mModelProb->getLErate())*(1.-exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayInternal())) - * prob_obs_external( mObsSequence->at(i+1) )*mHmmResults->mbwd_stateE_scaled[i+1] + - mModelProb->getLErate()*(1.-exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayInternal())) - * prob_obs_external( mObsSequence->at(i+1) )*mHmmResults->mbwd_stateLE_scaled[i+1])/scale ; - - mHmmResults->mbwd_stateE_scaled[i] = ((1.-exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayExternal())) - * prob_obs_internal( mObsSequence->at(i+1), mModelProb->getProbSegregDInternal() )*mHmmResults->mbwd_stateI_scaled[i+1] + - exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayExternal()) * prob_obs_external( mObsSequence->at(i+1) ) - *mHmmResults->mbwd_stateE_scaled[i+1])/scale ; - - mHmmResults->mbwd_stateLE_scaled[i] = ((1.-exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayLongExternal())) - * prob_obs_internal( mObsSequence->at(i+1), mModelProb->getProbSegregDInternal() )*mHmmResults->mbwd_stateI_scaled[i+1] + - exp(-1.*mObsSequence->at(i+1).getDist()/mModelProb->getStayLongExternal()) * prob_obs_external( mObsSequence->at(i+1) ) - *mHmmResults->mbwd_stateLE_scaled[i+1])/scale ; - - } - - ////////////////////// - // posterior decoding - mHmmResults->mExternal.resize( mObsSequence->size(), 0 ) ; // == 1 if external - for ( size_t i = 0 ; i < mObsSequence->size() ; ++i ) { - double val = (mHmmResults->mfwd_stateE_scaled[i]*mHmmResults->mbwd_stateE_scaled[i]) ; - double val2 = (mHmmResults->mfwd_stateLE_scaled[i]*mHmmResults->mbwd_stateLE_scaled[i]) ; - if ( val > 0.8 ) mHmmResults->mExternal[i] = 1 ; - else if ( val2 > 0.8 ) mHmmResults->mExternal[i] = 2 ; - else mHmmResults->mExternal[i] = 0 ; - } + ////////// + // forward probabilities + mHmmResults->mfwd_stateE_scaled.resize(mObsSequence->size(), 0.0); // external + mHmmResults->mfwd_stateLE_scaled.resize(mObsSequence->size(), 0.0); // long external + mHmmResults->mfwd_stateI_scaled.resize(mObsSequence->size(), 0.0); // internal + // first one by hand + vector fwd_scaling(mObsSequence->size(), 0.0); + // Initial state = internal + fwd_scaling[0] = prob_obs_internal(mObsSequence->at(0), mModelProb->getProbSegregDInternal()); + mHmmResults->mfwd_stateI_scaled[0] = 1.; // == fwd_stateI[0]/fwd_scaling[0] ; + // rest + for (int i = 1; i < mObsSequence->size(); ++i) + { + + double I_tmp = prob_obs_internal(mObsSequence->at(i), mModelProb->getProbSegregDInternal()) * mHmmResults->mfwd_stateI_scaled[i - 1] * 1. + / exp(mObsSequence->at(i).getDist() / mModelProb->getStayInternal()) + + prob_obs_internal(mObsSequence->at(i), mModelProb->getProbSegregDInternal()) * mHmmResults->mfwd_stateE_scaled[i - 1] + * (1. - 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayExternal())) + + prob_obs_internal(mObsSequence->at(i), mModelProb->getProbSegregDInternal()) * mHmmResults->mfwd_stateLE_scaled[i - 1] + * (1. - 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayLongExternal())); + + double E_tmp = + (prob_obs_external(mObsSequence->at(i)) * mHmmResults->mfwd_stateI_scaled[i - 1] * (1 - mModelProb->getLErate()) + * (1. - 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayInternal())) + + prob_obs_external(mObsSequence->at(i)) * mHmmResults->mfwd_stateE_scaled[i - 1] * 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayExternal())); + + double LE_tmp = + (prob_obs_external(mObsSequence->at(i)) * mHmmResults->mfwd_stateI_scaled[i - 1] * mModelProb->getLErate() + * (1. - 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayInternal())) + + prob_obs_external(mObsSequence->at(i)) * mHmmResults->mfwd_stateLE_scaled[i - 1] * 1. / exp(mObsSequence->at(i).getDist() / mModelProb->getStayLongExternal())); + + fwd_scaling[i] = I_tmp + E_tmp + LE_tmp; + + mHmmResults->mfwd_stateI_scaled[i] = I_tmp / fwd_scaling[i]; + mHmmResults->mfwd_stateE_scaled[i] = E_tmp / fwd_scaling[i]; + mHmmResults->mfwd_stateLE_scaled[i] = LE_tmp / fwd_scaling[i]; + } + + ////////////// + // Log-likelihood + mHmmResults->mlogLikelihood = 0.0; + for (int i = 1; i < mObsSequence->size(); ++i) + { + mHmmResults->mlogLikelihood += log(fwd_scaling[i]); + } + ////////////////// + // backward probabilities + mHmmResults->mbwd_stateI_scaled.resize(mObsSequence->size(), 0.0); + mHmmResults->mbwd_stateE_scaled.resize(mObsSequence->size(), 0.0); + mHmmResults->mbwd_stateLE_scaled.resize(mObsSequence->size(), 0.0); + mHmmResults->mbwd_stateI_scaled[mObsSequence->size() - 1] = 1.; + mHmmResults->mbwd_stateE_scaled[mObsSequence->size() - 1] = 1.; + mHmmResults->mbwd_stateLE_scaled[mObsSequence->size() - 1] = 1.; + for (int i = mObsSequence->size() - 2; i >= 0; i--) + { + + double scale = fwd_scaling[i + 1]; + mHmmResults->mbwd_stateI_scaled[i] = (exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayInternal()) + * prob_obs_internal(mObsSequence->at(i + 1), mModelProb->getProbSegregDInternal()) * mHmmResults->mbwd_stateI_scaled[i + 1] + + (1 - mModelProb->getLErate()) * (1. - exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayInternal())) + * prob_obs_external(mObsSequence->at(i + 1)) * mHmmResults->mbwd_stateE_scaled[i + 1] + + mModelProb->getLErate() * (1. - exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayInternal())) + * prob_obs_external(mObsSequence->at(i + 1)) * mHmmResults->mbwd_stateLE_scaled[i + 1]) + / scale; + + mHmmResults->mbwd_stateE_scaled[i] = + ((1. - exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayExternal())) * prob_obs_internal(mObsSequence->at(i + 1), mModelProb->getProbSegregDInternal()) + * mHmmResults->mbwd_stateI_scaled[i + 1] + + exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayExternal()) * prob_obs_external(mObsSequence->at(i + 1)) * mHmmResults->mbwd_stateE_scaled[i + 1]) + / scale; + + mHmmResults->mbwd_stateLE_scaled[i] = ((1. - exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayLongExternal())) + * prob_obs_internal(mObsSequence->at(i + 1), mModelProb->getProbSegregDInternal()) * mHmmResults->mbwd_stateI_scaled[i + 1] + + exp(-1. * mObsSequence->at(i + 1).getDist() / mModelProb->getStayLongExternal()) * prob_obs_external(mObsSequence->at(i + 1)) + * mHmmResults->mbwd_stateLE_scaled[i + 1]) + / scale; + } + + ////////////////////// + // posterior decoding + mHmmResults->mExternal.resize(mObsSequence->size(), 0); // == 1 if external + for (size_t i = 0; i < mObsSequence->size(); ++i) + { + double val = (mHmmResults->mfwd_stateE_scaled[i] * mHmmResults->mbwd_stateE_scaled[i]); + double val2 = (mHmmResults->mfwd_stateLE_scaled[i] * mHmmResults->mbwd_stateLE_scaled[i]); + if (val > 0.8) + mHmmResults->mExternal[i] = 1; + else if (val2 > 0.8) + mHmmResults->mExternal[i] = 2; + else + mHmmResults->mExternal[i] = 0; + } } -void hmm::writeLogFile(const char* fileName, string logInfo) +void hmm::writeLogFile(const char *fileName, string logInfo) { - ofstream myfile; - myfile.open(fileName, std::ios_base::app); - myfile << std::setprecision (15) << logInfo << "\t" - << mHmmResults->mlogLikelihood << "\t" - << mModelProb->getStayInternal() << "\t" - << mModelProb->getStayExternal() << "\t" - << mModelProb->getProbFixDInternal() << "\t" - << mModelProb->getProbFixDExternal() << "\t" - << mModelProb->getProbSegregDExternal() << "\t" - << mModelProb->getProbSegregDInternal().printAll() - << "\n"; - myfile.close(); + ofstream myfile; + myfile.open(fileName, std::ios_base::app); + myfile << std::setprecision(15) << logInfo << "\t" << mHmmResults->mlogLikelihood << "\t" << mModelProb->getStayInternal() << "\t" << mModelProb->getStayExternal() << "\t" + << mModelProb->getProbFixDInternal() << "\t" << mModelProb->getProbFixDExternal() << "\t" << mModelProb->getProbSegregDExternal() << "\t" + << mModelProb->getProbSegregDInternal().printAll() << "\n"; + myfile.close(); } -void hmm::writeLogFile3states(const char* fileName, string logInfo) +void hmm::writeLogFile3states(const char *fileName, string logInfo) { - ofstream myfile; - myfile.open(fileName, std::ios_base::app); - myfile << std::setprecision (15) << logInfo << "\t" - << mHmmResults->mlogLikelihood << "\t" - << mModelProb->getLErate() << "\t" - << mModelProb->getStayInternal() << "\t" - << mModelProb->getStayExternal() << "\t" - << mModelProb->getStayLongExternal() << "\t" - << mModelProb->getProbFixDInternal() << "\t" - << mModelProb->getProbFixDExternal() << "\t" - << mModelProb->getProbSegregDExternal() << "\t" - << mModelProb->getProbSegregDInternal().printAll() - << "\n"; - myfile.close(); + ofstream myfile; + myfile.open(fileName, std::ios_base::app); + myfile << std::setprecision(15) << logInfo << "\t" << mHmmResults->mlogLikelihood << "\t" << mModelProb->getLErate() << "\t" << mModelProb->getStayInternal() << "\t" + << mModelProb->getStayExternal() << "\t" << mModelProb->getStayLongExternal() << "\t" << mModelProb->getProbFixDInternal() << "\t" << mModelProb->getProbFixDExternal() + << "\t" << mModelProb->getProbSegregDExternal() << "\t" << mModelProb->getProbSegregDInternal().printAll() << "\n"; + myfile.close(); } -void hmm::writeOutputFile(void) +void hmm::writeOutputFile(void) { - for(size_t i=1; isize(); ++i) { - cout << mObsSequence->at(i).getChr() << "\t" - << mObsSequence->at(i).getLocation() << "\t" - << mObsSequence->at(i).getClint() << "\t" - << mObsSequence->at(i).getCoverage() << "\t" - << mObsSequence->at(i).getDerived() << "\t" - << mObsSequence->at(i).getUlindi() << "\t" - << mObsSequence->at(i).getDist() << "\t" - << mHmmResults->mExternal[i] << "\t" - << mHmmResults->mfwd_stateE_scaled[i]*mHmmResults->mbwd_stateE_scaled[i] << "\n" ; - } + for (size_t i = 1; i < mObsSequence->size(); ++i) + { + cout << mObsSequence->at(i).getChr() << "\t" << mObsSequence->at(i).getLocation() << "\t" << mObsSequence->at(i).getClint() << "\t" << mObsSequence->at(i).getCoverage() + << "\t" << mObsSequence->at(i).getDerived() << "\t" << mObsSequence->at(i).getUlindi() << "\t" << mObsSequence->at(i).getDist() << "\t" << mHmmResults->mExternal[i] + << "\t" << mHmmResults->mfwd_stateE_scaled[i] * mHmmResults->mbwd_stateE_scaled[i] << "\n"; + } } void hmm::writeOutputFile3states(void) { - for(size_t i=1; isize(); ++i) { - cout << mObsSequence->at(i).getChr() << "\t" - << mObsSequence->at(i).getLocation() << "\t" - << mObsSequence->at(i).getClint() << "\t" - << mObsSequence->at(i).getCoverage() << "\t" - << mObsSequence->at(i).getDerived() << "\t" - << mObsSequence->at(i).getUlindi() << "\t" - << mObsSequence->at(i).getDist() << "\t" - << mHmmResults->mExternal[i] << "\t" - << mHmmResults->mfwd_stateE_scaled[i]*mHmmResults->mbwd_stateE_scaled[i] << "\t" - << mHmmResults->mfwd_stateLE_scaled[i]*mHmmResults->mbwd_stateLE_scaled[i] << "\n" ; - } + for (size_t i = 1; i < mObsSequence->size(); ++i) + { + cout << mObsSequence->at(i).getChr() << "\t" << mObsSequence->at(i).getLocation() << "\t" << mObsSequence->at(i).getClint() << "\t" << mObsSequence->at(i).getCoverage() + << "\t" << mObsSequence->at(i).getDerived() << "\t" << mObsSequence->at(i).getUlindi() << "\t" << mObsSequence->at(i).getDist() << "\t" << mHmmResults->mExternal[i] + << "\t" << mHmmResults->mfwd_stateE_scaled[i] * mHmmResults->mbwd_stateE_scaled[i] << "\t" << mHmmResults->mfwd_stateLE_scaled[i] * mHmmResults->mbwd_stateLE_scaled[i] + << "\n"; + } } diff --git a/hmm.h b/hmm.h index 6f6ceda..002726b 100644 --- a/hmm.h +++ b/hmm.h @@ -1,52 +1,48 @@ +#include #include -#include +#include +#include #include #include -#include #include -#include -#include +#include -#include "obsData.h" -#include "modelprob.h" #include "hmmresults.h" +#include "modelprob.h" +#include "obsData.h" // These should be parameters: //#define EXTERNAL_ULINDI_DERIVED_PROB 0.01 -using namespace std ; +using namespace std; class hmm { - public: - hmm( obsSequence *seq) ; - - void computeFwdBwd(modelProb *prob, hmmResults *result); - void computeFwdBwd3states(modelProb *prob, hmmResults *result); - void writeLogFile(const char* fileName, string logInfo); - void writeLogFile3states(const char* fileName, string logInfo); - void writeOutputFile(void); - void writeOutputFile3states(void); - - double prob_obs_internal( obsSite dat, probSegregDInternal prob ); - double prob_obs_external( obsSite dat ); - - protected: - - obsSequence *mObsSequence; - modelProb *mModelProb; - hmmResults *mHmmResults; - - //temporary hmm results -// vector mfwd_stateE_scaled; -// vector mfwd_stateI_scaled; -// vector mbwd_stateI_scaled; -// vector mbwd_stateE_scaled; -// vector mExternal; - -// double mlogLikelihood; - -} ; - - + public: + hmm(obsSequence *seq); + + void computeFwdBwd(modelProb *prob, hmmResults *result); + void computeFwdBwd3states(modelProb *prob, hmmResults *result); + void writeLogFile(const char *fileName, string logInfo); + void writeLogFile3states(const char *fileName, string logInfo); + void writeOutputFile(void); + void writeOutputFile3states(void); + + double prob_obs_internal(obsSite dat, probSegregDInternal prob); + double prob_obs_external(obsSite dat); + + protected: + obsSequence *mObsSequence; + modelProb *mModelProb; + hmmResults *mHmmResults; + + // temporary hmm results + // vector mfwd_stateE_scaled; + // vector mfwd_stateI_scaled; + // vector mbwd_stateI_scaled; + // vector mbwd_stateE_scaled; + // vector mExternal; + + // double mlogLikelihood; +}; diff --git a/hmmresults.cc b/hmmresults.cc index 58600ed..d6ec711 100644 --- a/hmmresults.cc +++ b/hmmresults.cc @@ -3,4 +3,3 @@ hmmResults::hmmResults() { } - diff --git a/hmmresults.h b/hmmresults.h index b1f56a1..461c1f1 100644 --- a/hmmresults.h +++ b/hmmresults.h @@ -7,7 +7,7 @@ using namespace std; class hmmResults { -public: + public: hmmResults(); vector mfwd_stateE_scaled; vector mfwd_stateLE_scaled; @@ -21,4 +21,3 @@ class hmmResults }; #endif // HMMRESULTS_H - diff --git a/main.cc b/main.cc index 00fa956..1370d51 100644 --- a/main.cc +++ b/main.cc @@ -1,141 +1,142 @@ -#include +#include #include +#include #include -#include +#include #include -#include #include -using namespace std ; +using namespace std; #include "hillclimbing.h" -int main( int argc, const char *argv[] ) +int main(int argc, const char *argv[]) { // Commandline parameters - int length_internal = 0 ; - int length_external = 0 ; - int length_long_external = 0 ; - double LErate = 0. ; - char *configfile = 0 ; - double prob_fixD_internal = 0. ; - double prob_fixD_external = 0. ; - double converThrld = 0.1 ; - bool baum_welch = false ; + int length_internal = 0; + int length_external = 0; + int length_long_external = 0; + double LErate = 0.; + char *configfile = 0; + double prob_fixD_internal = 0.; + double prob_fixD_external = 0.; + double converThrld = 0.1; + bool baum_welch = false; bool nelderMead = false; - int additionalParameters = 0 ; + int additionalParameters = 0; char *logFileName = 0; - double EXTERNAL_ULINDI_DERIVED_PROB = 0.01;// This should be a parameter + double EXTERNAL_ULINDI_DERIVED_PROB = 0.01; // This should be a parameter struct poptOption optionsTable[] = { - { "length_internal", 'L', POPT_ARG_INT, &length_internal, 0, "Average length of internal region", "length" }, - { "length_external", 'l', POPT_ARG_INT, &length_external, 0, "Average length of external region", "length" }, - { "emission_probabilities", 'e', POPT_ARG_STRING, &configfile, 0, "Emission probabilities for internal state by frequency", "file" }, - { "Baum-Welch", 'B', POPT_ARG_DOUBLE, &converThrld, 0, "Baum-Welch algorithm to re-estimate parameters", "value" }, - { "NelderMead", 'N', POPT_ARG_DOUBLE, &converThrld, 0, "NelderMead algorithm", "converThrld" }, - { "prob_fixD_internal", 'F', POPT_ARG_DOUBLE, &prob_fixD_internal, 0, "Probability for sharing fixed derived when in an internal region", "prob" }, - { "prob_fixD_external", 'f', POPT_ARG_DOUBLE, &prob_fixD_external, 0, "Probability for sharing fixed derived when in an external region", "prob" }, - { "length_long_external", 'S', POPT_ARG_INT, &length_long_external, 0, "Average length of long external regions", "length"}, - { "long_external_rate", 'r', POPT_ARG_DOUBLE, &LErate, 0, "Proportion of external regions produced by positive selection", "rate"}, - { "logfile", 'o', POPT_ARG_STRING, &logFileName, 0, "Set log file name", "path/name"}, - POPT_AUTOHELP { NULL, 0, 0, NULL, 0 } //Should we change something here? + {"length_internal", 'L', POPT_ARG_INT, &length_internal, 0, "Average length of internal region", "length"}, + {"length_external", 'l', POPT_ARG_INT, &length_external, 0, "Average length of external region", "length"}, + {"emission_probabilities", 'e', POPT_ARG_STRING, &configfile, 0, "Emission probabilities for internal state by frequency", "file"}, + {"Baum-Welch", 'B', POPT_ARG_DOUBLE, &converThrld, 0, "Baum-Welch algorithm to re-estimate parameters", "value"}, + {"NelderMead", 'N', POPT_ARG_DOUBLE, &converThrld, 0, "NelderMead algorithm", "converThrld"}, + {"prob_fixD_internal", 'F', POPT_ARG_DOUBLE, &prob_fixD_internal, 0, "Probability for sharing fixed derived when in an internal region", "prob"}, + {"prob_fixD_external", 'f', POPT_ARG_DOUBLE, &prob_fixD_external, 0, "Probability for sharing fixed derived when in an external region", "prob"}, + {"length_long_external", 'S', POPT_ARG_INT, &length_long_external, 0, "Average length of long external regions", "length"}, + {"long_external_rate", 'r', POPT_ARG_DOUBLE, &LErate, 0, "Proportion of external regions produced by positive selection", "rate"}, + {"logfile", 'o', POPT_ARG_STRING, &logFileName, 0, "Set log file name", "path/name"}, + POPT_AUTOHELP{NULL, 0, 0, NULL, 0} // Should we change something here? }; poptContext optCon; - optCon = poptGetContext( "sweep_hmm", argc, argv, optionsTable, 0 ) ; - poptSetOtherOptionHelp( optCon, "[OPTIONS]* " ) ; - int rc = poptGetNextOpt( optCon ) ; - if (rc != -1) { - poptPrintUsage( optCon, stderr, 0 ) ; - return 1; + optCon = poptGetContext("sweep_hmm", argc, argv, optionsTable, 0); + poptSetOtherOptionHelp(optCon, "[OPTIONS]* "); + int rc = poptGetNextOpt(optCon); + if (rc != -1) + { + poptPrintUsage(optCon, stderr, 0); + return 1; } // inputfile present? - const char* infile = poptGetArg( optCon ) ; - if (!infile) { - cerr << "Error: need inputfile as argument." << endl; - poptPrintUsage( optCon, stderr, 0 ) ; - return 1; + const char *infile = poptGetArg(optCon); + if (!infile) + { + cerr << "Error: need inputfile as argument." << endl; + poptPrintUsage(optCon, stderr, 0); + return 1; } // all other parameters present? - if ( length_internal <= 0 || length_external <= 0 || !configfile || prob_fixD_internal <= 0. || prob_fixD_external <= 0. ||prob_fixD_internal >= 1. || prob_fixD_external >= 1. ) + if (length_internal <= 0 || length_external <= 0 || !configfile || prob_fixD_internal <= 0. || prob_fixD_external <= 0. || prob_fixD_internal >= 1. || prob_fixD_external >= 1.) { - cerr << "Error: length_internal, length_external, configfile, prob_fixD_internal and prob_fixD_external must be set." << endl ; - poptPrintUsage( optCon, stderr, 0 ) ; - return 1; + cerr << "Error: length_internal, length_external, configfile, prob_fixD_internal and prob_fixD_external must be set." << endl; + poptPrintUsage(optCon, stderr, 0); + return 1; } // Baum-Welch and/or 3-state algorithm turned on? - for(int i=0;i> num_derived ; - double prob_derived ; - ss >> prob_derived ; - if ( num_derived > 1023 ) { - cerr << "Can't deal with coverage > 1023. (Adjust in code observation.cc)" << endl ; - exit( 1 ) ; - } - probs[num_derived] = prob_derived ; - if(num_derived>max_num_derived){ - max_num_derived = num_derived; - } + // probs.resize( 1024, 0. ) ; // assuming this is the limit. + string s; + while (in) + { + getline(in, s); + stringstream ss(s, stringstream::in); + int num_derived; + ss >> num_derived; + double prob_derived; + ss >> prob_derived; + if (num_derived > 1023) + { + cerr << "Can't deal with coverage > 1023. (Adjust in code observation.cc)" << endl; + exit(1); + } + probs[num_derived] = prob_derived; + if (num_derived > max_num_derived) + { + max_num_derived = num_derived; } - probs.resize(max_num_derived+1); + } + probs.resize(max_num_derived + 1); } -double probSegregDInternal::getProb( int num_derived ) +double probSegregDInternal::getProb(int num_derived) { - return probs[num_derived] ; + return probs[num_derived]; } void probSegregDInternal::setProbsAt(int index, double value) { - if(index<=probs.size()) - { - probs[index] = value; - // cerr << "probs[index]" << probs[index] << "index:" << index << endl; - } - else - { - cerr << "Can't setProbsAt index > vector size (observation.cc) size: " << probs.size() << " index: " << index << endl; - exit(1); - } + if (index <= probs.size()) + { + probs[index] = value; + // cerr << "probs[index]" << probs[index] << "index:" << index << endl; + } + else + { + cerr << "Can't setProbsAt index > vector size (observation.cc) size: " << probs.size() << " index: " << index << endl; + exit(1); + } } string probSegregDInternal::printAll(void) { - ostringstream oss; - string myStr; + ostringstream oss; + string myStr; - for(int i=1;i -#include -#include #include +#include #include +#include #include -#include +#include +#include -using namespace std ; +using namespace std; class probSegregDInternal { - public: - probSegregDInternal(); - void load( istream &in ) ; - int getSize(void) {return probs.size();}; - void resize(int value) {probs.resize(value);}; - double getProb( int num_derived ) ; - void setProbsAt(int index, double value); - string printAll(void); - private: - vector probs ; // probabilities: 0 is not used, $1 .. coverage-1$ is for derived alleles 1 to coverage-1. - size_t fixed_coverage ; // used to check whether input-lines are wrong - -} ; + public: + probSegregDInternal(); + void load(istream &in); + int getSize(void) + { + return probs.size(); + }; + void resize(int value) + { + probs.resize(value); + }; + double getProb(int num_derived); + void setProbsAt(int index, double value); + string printAll(void); + + private: + vector probs; // probabilities: 0 is not used, $1 .. coverage-1$ is for derived alleles 1 to coverage-1. + size_t fixed_coverage; // used to check whether input-lines are wrong +}; class modelProb { -public: + public: modelProb(); - char loadProb(const char* fileName); + char loadProb(const char *fileName); + // clang-format off void setStayInternal(double value) {mStayInternal = value;}; void setStayExternal(double value) {mStayExternal = value;}; void setStayLongExternal(double value) {mStayLongExternal = value;}; @@ -55,11 +62,12 @@ class modelProb double getProbFixDInternal(void) {return mProbFixDInternal;}; double getProbFixDExternal(void) {return mProbFixDExternal;}; double getProbSegregDExternal(void) {return mProbSegregDExternal;}; - + // clang-format on + -protected: + protected: probSegregDInternal obs; - //probSegregDInternal backup; //used only in the optimization step to get back to old values if the new ones are not better + // probSegregDInternal backup; //used only in the optimization step to get back to old values if the new ones are not better double mStayInternal; double mStayExternal; @@ -71,7 +79,4 @@ class modelProb }; - - #endif // MODELPROB_H - diff --git a/obsData.cc b/obsData.cc index 4b7525c..7364e2c 100644 --- a/obsData.cc +++ b/obsData.cc @@ -2,29 +2,28 @@ #include "obsData.h" -obsSite::obsSite( ) +obsSite::obsSite() { - mChr = "" ; - mLocation = 0; - mCoverage = 0; - mDerived = 0; - mClint = 0; - mUlindi = 0; - mDist = 0.0; + mChr = ""; + mLocation = 0; + mCoverage = 0; + mDerived = 0; + mClint = 0; + mUlindi = 0; + mDist = 0.0; } -//redefine >> operator -std::istream& operator >>(std::istream& Stream, obsSite& Obj) +// redefine >> operator +std::istream &operator>>(std::istream &Stream, obsSite &Obj) { Stream >> Obj.mChr >> Obj.mLocation >> Obj.mClint >> Obj.mCoverage >> Obj.mDerived >> Obj.mUlindi >> Obj.mDist; return Stream; } -obsSequence::obsSequence( ) +obsSequence::obsSequence() { - } int obsSequence::size(void) @@ -37,25 +36,23 @@ obsSite obsSequence::at(int i) return mSequence[i]; } -char obsSequence::loadSequence(const char* fileName) +char obsSequence::loadSequence(const char *fileName) { - ifstream dataStream( fileName, ifstream::in ); - if ( ! dataStream ) { - cerr << "Error: Can't open inputfile `" << fileName << "'" << endl ; - return 1; - } - while ( dataStream ) - { - string s ; - getline( dataStream, s ) ; - istringstream ss( s ) ; - obsSite tmp; - ss >> tmp; - - if ( tmp.getChr() != "" ) - mSequence.push_back( tmp ) ; - } + ifstream dataStream(fileName, ifstream::in); + if (!dataStream) + { + cerr << "Error: Can't open inputfile `" << fileName << "'" << endl; + return 1; + } + while (dataStream) + { + string s; + getline(dataStream, s); + istringstream ss(s); + obsSite tmp; + ss >> tmp; + + if (tmp.getChr() != "") + mSequence.push_back(tmp); + } } - - - diff --git a/obsData.h b/obsData.h index f8cb2bf..0c1462a 100644 --- a/obsData.h +++ b/obsData.h @@ -1,23 +1,24 @@ #ifndef OBSDATA_H #define OBSDATA_H -#include -#include #include +#include #include #include -#include +#include +#include -using namespace std ; +using namespace std; class obsSite { - friend std::istream& operator >>(std::istream&, obsSite&); + friend std::istream &operator>>(std::istream &, obsSite &); - public: - obsSite() ; - + public: + obsSite(); + + // clang-format off string getChr(void) {return mChr;}; long getLocation(void) {return mLocation;}; int getCoverage(void) {return mCoverage;}; @@ -34,30 +35,31 @@ class obsSite void setClint(char value) {mClint = value;}; void setUlindi(char value) {mUlindi = value;}; void setDist(double value) {mDist = value;}; - - string print(void); + // clang-format on - private: - string mChr ; - long mLocation ; - int mCoverage ; - int mDerived ; - char mClint ; - char mUlindi ; - double mDist ; -} ; + string print(void); + + private: + string mChr; + long mLocation; + int mCoverage; + int mDerived; + char mClint; + char mUlindi; + double mDist; +}; class obsSequence { - public: - obsSequence(); - char loadSequence(const char* fileName); + public: + obsSequence(); + char loadSequence(const char *fileName); - int size(void); - obsSite at(int i); + int size(void); + obsSite at(int i); - protected: - vector mSequence; + protected: + vector mSequence; };