SequenceExtractor.cpp

Go to the documentation of this file.
00001 /*
00002 This file is part of LIA_RAL which is a set of software based on ALIZE
00003 toolkit for speaker recognition. ALIZE toolkit is required to use LIA_RAL.
00004 
00005 LIA_RAL project is a development project was initiated by the computer
00006 science laboratory of Avignon / France (Laboratoire Informatique d'Avignon -
00007 LIA) [http://lia.univ-avignon.fr <http://lia.univ-avignon.fr/>]. Then it
00008 was supported by two national projects of the French Research Ministry:
00009         - TECHNOLANGUE program [http://www.technolangue.net]
00010         - MISTRAL program [http://mistral.univ-avignon.fr]
00011 
00012 LIA_RAL is free software: you can redistribute it and/or modify
00013 it under the terms of the GNU Lesser General Public License as
00014 published by the Free Software Foundation, either version 3 of
00015 the License, or any later version.
00016 
00017 LIA_RAL is distributed in the hope that it will be useful,
00018 but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00020 GNU Lesser General Public License for more details.
00021 
00022 You should have received a copy of the GNU Lesser General Public
00023 License along with LIA_RAL.
00024 If not, see [http://www.gnu.org/licenses/].
00025 
00026 The LIA team as well as the LIA_RAL project team wants to highlight the
00027 limits of voice authentication in a forensic context.
00028 The "Person Authentification by Voice: A Need of Caution" paper
00029 proposes a good overview of this point (cf. "Person
00030 Authentification by Voice: A Need of Caution", Bonastre J.F.,
00031 Bimbot F., Boe L.J., Campbell J.P., Douglas D.A., Magrin-
00032 chagnolleau I., Eurospeech 2003, Genova].
00033 The conclusion of the paper of the paper is proposed bellow:
00034 [Currently, it is not possible to completely determine whether the
00035 similarity between two recordings is due to the speaker or to other
00036 factors, especially when: (a) the speaker does not cooperate, (b) there
00037 is no control over recording equipment, (c) recording conditions are not
00038 known, (d) one does not know whether the voice was disguised and, to a
00039 lesser extent, (e) the linguistic content of the message is not
00040 controlled. Caution and judgment must be exercised when applying speaker
00041 recognition techniques, whether human or automatic, to account for these
00042 uncontrolled factors. Under more constrained or calibrated situations,
00043 or as an aid for investigative purposes, judicious application of these
00044 techniques may be suitable, provided they are not considered as infallible.
00045 At the present time, there is no scientific process that enables one to
00046 uniquely characterize a persones voice or to identify with absolute
00047 certainty an individual from his or her voice.]
00048 
00049 Copyright (C) 2004-2010
00050 Laboratoire d'informatique d'Avignon [http://lia.univ-avignon.fr]
00051 LIA_RAL admin [alize@univ-avignon.fr]
00052 Jean-Francois Bonastre [jean-francois.bonastre@univ-avignon.fr]
00053 */
00054 
00055 #if !defined(ALIZE_SequenceExtractor_cpp)
00056 #define ALIZE_SequenceExtractor_cpp
00057 
00058 #include <iostream>  
00059 #include <fstream>  // pour outFile
00060 #include <cstdio>   // pour printf()
00061 #include <cassert> // pour le debug pratique
00062 #include <cmath>
00063 #include <string>
00064 
00065 #include "liatools.h"
00066 #include "BNGram.h"
00067 #include "SequenceExtractor.h"
00068 
00069  
00070 //*************************************************************************************************************************
00071 //************ Common part tree functions
00072 //************ common part tree is the main class for the ssequence exctraction. It represents the  ngrams in the tree
00073 //************* and implements the sequence extraction functions using this structure
00074 //*************************************************************************************************************************
00075 CommonPartTree::CommonPartTree()
00076 {
00077   _totalCount=0;
00078   _totalChildCount=0;
00079   _seed=NULL;
00080 }
00081 CommonPartTree:: CommonPartTree(const CommonPartTree &obj){
00082 throw Exception("recopy constructor not available"
00083         , __FILE__, __LINE__);
00084 }
00085 CommonPartTree CommonPartTree:: operator=(const CommonPartTree & obj){
00086 throw Exception("= constructor not available"
00087         , __FILE__, __LINE__);
00088 }
00089 
00090 CommonPartTree::~CommonPartTree(){
00091   _freeTree(_seed);
00092 }
00093 void CommonPartTree::_freeTree(CommonPartTreeNode *ptr){
00094   if (ptr==NULL) return;
00095   _freeTree(ptr->br);
00096   _freeTree(ptr->ch);
00097   delete ptr;
00098 }
00099 CommonPartTreeNode* CommonPartTree::_newNode(const short int symb,const unsigned long count,
00100                                             CommonPartTreeNode *ch,CommonPartTreeNode *br){
00101   CommonPartTreeNode *ptr=new CommonPartTreeNode;
00102   ptr->symb=symb;
00103   ptr->count=count;
00104   ptr->totalChildCount=0;
00105   ptr->ch=ch;
00106   ptr->br=br;
00107   return ptr;
00108 }
00109 
00110 CommonPartTreeNode* CommonPartTree::_findInsert(const short int symb,unsigned long count, CommonPartTreeNode * ptr){
00111   if (ptr==NULL)       // We arrived on a leaf
00112     return _newNode(symb,count,NULL,NULL);
00113   if (ptr->symb!=symb) {
00114     CommonPartTreeNode *pTmp=_findInsert(symb,count,ptr->br);
00115     if (ptr->br==NULL) ptr->br=pTmp;
00116     return pTmp;
00117   }
00118   else return ptr;
00119 }
00120 
00121 void CommonPartTree::addNGram(NGram & nGram){
00122   for (unsigned long idx=0; idx<nGram.getSize();idx++){ // For each ngram
00123     unsigned long count=nGram.getCount(idx);
00124     CommonPartTreeNode *currentP=_findInsert(nGram.getSymbol(idx,0),count,_seed);
00125     if (_seed==NULL) _seed=currentP;  // special case for first insertion
00126     CommonPartTreeNode *tmpP=NULL;
00127     for (unsigned long order=1;order<nGram.getOrder();order++){
00128       tmpP=currentP;
00129       currentP=_findInsert(nGram.getSymbol(idx,order),count,currentP->ch);
00130       if (tmpP->ch==NULL) tmpP->ch=currentP;
00131     } // end of ngram loop  
00132     if (nGram.getOrder()==1)  _totalChildCount+=count; // special case for unigrams
00133     else tmpP->totalChildCount+=count;
00134   } // End ngram loop
00135 }
00136 
00137 // findPartSeq finds the path corresponding to the seq in the tree and return the node corresponding to 
00138 // the requested order. Retunr null if no node is available
00139 CommonPartTreeNode * CommonPartTree::_findPartSeq(Seq &seq,short int order,CommonPartTreeNode *ptr){
00140   if (ptr==NULL) return NULL;
00141   if (seq.getLength()==0) return _seed;      
00142   if (order>=seq.getLength()) return NULL;                // 
00143   if (seq[order].isIn(ptr->symb)){ 
00144     if (order==seq.getLength()-1) return ptr;               // We are on the last part of the sequence
00145     else return _findPartSeq(seq,order+1,ptr->ch);
00146   }
00147   else return _findPartSeq(seq,order,ptr->br);
00148 }
00149 
00150 // findMaxSeq search the sequence with the maximum count and length and return the corresponding count
00151 // Could work from scratch or from a current seg and an order where to work
00152 // It launches recursive private functions
00153 
00154 unsigned long CommonPartTree::_findMaxSeq(CommonPartTreeNode *ptr,Seq &seq, short int & order){
00155   if (ptr==NULL){ order=0; return 0;}
00156   short int orderCh=order+1;
00157   short int orderBr=order;
00158   Seq seqBr=seq;
00159   seq.setLength(order+1);seq[order].setSymb(ptr->symb);
00160   unsigned long brCount=_findMaxSeq(ptr->br,seqBr,orderBr);
00161   unsigned long chCount=_findMaxSeq(ptr->ch,seq,orderCh);
00162  
00163   if (orderBr<orderCh){seq[order].setSymb(ptr->symb); order=orderCh;return chCount;}  // child is the longuest sequence and win
00164   // child and/or borther == NULL
00165   if (orderCh==0){
00166     if (orderBr==0) return ptr->count;                                   // Brother==NULL, current is the winner
00167     if (orderBr>order) {order=orderBr; seq=seqBr;return brCount;}       // Brother find the longest sequence
00168     // equal length and child==NULL
00169     if (ptr->count>brCount)  return ptr->count;                              // current is the winner
00170     else {order=orderBr; seq=seqBr;return brCount;}                     // Brother is the winner
00171   }
00172   // child and Brother !=NULL
00173   if (orderBr>orderCh){order=orderBr; seq=seqBr;return brCount;}        // Brother find the longest sequence and win
00174   // Equal length (and child and brother !=NULL)     
00175   if (brCount>(chCount)){order=orderBr;seq=seqBr;return brCount;}       // Brother has the max count and win
00176   else{seq[order].setSymb(ptr->symb);order=orderCh;return chCount;}     // child has the max count and win
00177 }
00178 unsigned long CommonPartTree::findMaxSeq(Seq &seq){
00179   short int order=0; 
00180   if (_seed!=NULL){
00181     return _findMaxSeq(_seed,seq,order);
00182   }
00183   return 0;
00184 }
00185 unsigned long CommonPartTree::findMaxEndSeq(Seq &seq){
00186   if (seq.getLength()==0) return findMaxSeq(seq);
00187   if (_seed!=NULL){
00188     CommonPartTreeNode * ptr=_findPartSeq(seq,0,_seed);
00189     if (ptr==NULL) return 0;
00190     short int order=seq.getLength();
00191     if (ptr->ch != NULL) return _findMaxSeq(ptr->ch,seq,order);
00192     else return ptr->count;
00193   }
00194   return 0;
00195 }
00196 
00197 // It suppress the nodes corresponding to the sequence in the tree - interface function and recursive function
00198 // Take care, a sequence could contain more than one symbol by node...
00199 CommonPartTreeNode* CommonPartTree::_suppressSeq(CommonPartTreeNode * ptr,Seq &seq,
00200                                                 short int order, unsigned long &childCountDelta){
00201   if (order>=seq.getLength())
00202     throw Exception("sequence longer than the tree" , __FILE__, __LINE__);
00203   CommonPartTreeNode* ptrRet=ptr;
00204   CommonPartTreeNode* ptrTmp=ptr;
00205   while ((ptr!=NULL)&& (!seq[order].isIn(ptr->symb))) {ptrTmp=ptr;ptr=ptr->br;}
00206   if (seq[order].isIn(ptr->symb)){      // We are in the good tree branch
00207     if (order==seq.getLength()-1) {     // and at the end of the sequence
00208       childCountDelta=ptr->count;
00209       ptrTmp->br=ptr->br;
00210       if (ptrRet==ptr) ptrRet=ptr->br;
00211       delete ptr; 
00212     }
00213     else{ // We are in the good branch but not at the end of the sequence
00214       ptr->ch=_suppressSeq(ptr->ch,seq,order+1,childCountDelta);  
00215       if (ptr->count<childCountDelta) throw Exception("count problem in the tree, childcount < delta" , __FILE__, __LINE__);
00216       ptr->totalChildCount-=childCountDelta;
00217       ptr->count-=childCountDelta;
00218       if (ptr->count==0) {               // We are in a node with a new count=0, to be destroyed
00219         ptrTmp->br=ptr->br;
00220         if (ptrRet==ptr) ptrRet=ptr->br;
00221       }
00222     }
00223   }
00224   else{ // Branch nor find
00225     childCountDelta=0;
00226     ptrRet= NULL;
00227   }
00228   return ptrRet;
00229 }
00230 
00231 void CommonPartTree::suppressSeq(Seq &seq){
00232   short int order=0;
00233   if (seq.getLength()==0) return;
00234   unsigned long childCountDelta=0;
00235   _seed=_suppressSeq(_seed,seq,order,childCountDelta);
00236   _totalChildCount-=childCountDelta;
00237 }
00238 void CommonPartTree::_show(CommonPartTreeNode *ptr,unsigned long order){
00239   if (ptr==NULL) return;
00240   cout <<"Order["<<order<<"] Symbol["<<ptr->symb<<"] count["<<ptr->count<<"]Childcount["<<ptr->totalChildCount<<"]"<<endl;
00241   _show(ptr->ch,order+1);
00242   _show(ptr->br,order);
00243 }
00244 void CommonPartTree::show(){
00245   cout <<"TotalCount["<<_totalCount<<"]totalChildCount["<<_totalChildCount<<"]"<<endl;
00246   _show(_seed,0);
00247 }
00248 //*************************************************************************************************************************
00249 //************ SymbTab 
00250 //************ SymTab is simply a code for the symbols. It allows to group several input symbols in only one node
00251 //************ It is ssimply an array of booleans. The boolean i says if the input symbol i is present or not in the node
00252 //************ (this multiple symbols functionality is not used currently, implemented for future use)
00253 //*************************************************************************************************************************
00254 void SymbTab::reserveMem(){
00255   _symbTab= new bool[_nbSymb];
00256 }
00257 SymbTab:: SymbTab(const SymbTab &obj){
00258   _nbSymb=obj._nbSymb;
00259   reserveMem();
00260   for (int i=0;i<_nbSymb;i++) _symbTab[i]=obj._symbTab[i];
00261 } 
00262 SymbTab& SymbTab:: operator=(const SymbTab &obj){
00263   if (this!=(&obj)){
00264     if (_symbTab!=NULL) delete [] _symbTab;
00265     _nbSymb=obj._nbSymb;
00266     reserveMem();
00267     for (int i=0;i<_nbSymb;i++) _symbTab[i]=obj._symbTab[i];
00268   }
00269   return (*this);
00270 }
00271 bool SymbTab:: operator==(const SymbTab & obj){
00272   if ((_symbTab==NULL)|| (obj._symbTab==NULL)) throw Exception("symbol array not allowed" , __FILE__, __LINE__);
00273   if (_nbSymb!=obj._nbSymb) throw Exception("symbol arrays differ in size" , __FILE__, __LINE__);
00274   for (int i=0;i<_nbSymb;i++) if (_symbTab[i]!=obj._symbTab[i]) return false;
00275   return true;
00276 }
00277 bool SymbTab:: operator!=(const SymbTab & obj){
00278   if ((_symbTab==NULL)|| (obj._symbTab==NULL)) throw Exception("symbol array not allowed" , __FILE__, __LINE__);
00279   if (_nbSymb!=obj._nbSymb) throw Exception("symbol arrays differ in size" , __FILE__, __LINE__);
00280   for (int i=0;i<_nbSymb;i++) if (_symbTab[i]!=obj._symbTab[i]) return true;
00281   return false;
00282 }
00283 bool SymbTab::isIn(short int symb){
00284   if (_symbTab==NULL) throw Exception("symbol array not allowed" , __FILE__, __LINE__);
00285   if ((symb<0)||(symb>=_nbSymb)){
00286     cout << "symb=["<<symb<<"]"<<endl;
00287     throw Exception("symbol out of boundaries" , __FILE__, __LINE__);
00288   }
00289   return _symbTab[symb];
00290 }
00291 void SymbTab::setSymb(short int symb){
00292   if ((symb<0) || (symb>_nbSymb))throw Exception("index out of bound"
00293                                                  , __FILE__, __LINE__);
00294   _symbTab[symb]=true;
00295 }
00296 void SymbTab::show(){
00297   cout <<"[";
00298   for (int i=0;i<_nbSymb;i++) 
00299     if (_symbTab[i]) cout <<i<<" ";
00300   cout <<"]"<<endl;
00301 }
00302 
00303 //*****************************************************************************************************************************************
00304 //*** ReadMemory groups all the functionalities for reading the input files, in a segmental mode
00305 //*** It reads the symbol one by one in the segment and says when the end of file or the end of segment is reached
00306 //*** It also allows to comeback in the history for the deconding, for syntax analysys using non LL1 grammary (used here
00307 //*** It will allow binary file support asap
00308 //*****************************************************************************************************************************************
00309 
00310 ReadMemory::ReadMemory(String inputFilename,int bufSize=100,unsigned long begin=0,unsigned long length=0){
00311   _inputFile.open(inputFilename.c_str());
00312   _bufSize=bufSize;
00313   _buf=new short int[_bufSize];
00314   _idx=0;
00315   _realIdx=0;
00316   _begin=begin;
00317   _length=length;
00318   _segmental=(length>0);
00319 }
00320 ReadMemory::~ReadMemory(){
00321   delete [] _buf;
00322   _inputFile.close();
00323 }
00324 bool ReadMemory::notEof(){
00325   if (!_segmental) return (!_inputFile.eof());
00326   else return ((_idx<_begin+_length)&&(!_inputFile.eof()));
00327 }
00328 bool ReadMemory::eof(){
00329   return !(notEof());
00330 }
00331 bool ReadMemory::readSymb(short int &symb,string & oov){ 
00332   if (notEof()){
00333     char c;
00334     _inputFile>>c;
00335     while (notEof() && (c<=32)) _inputFile>>c; 
00336     if (notEof()){
00337       _inputFile.putback(c);
00338       if ((c >= '0') && (c <= '9')) _inputFile >> symb;
00339       else{
00340         _inputFile>>oov;
00341         symb=-1;
00342       }
00343       if (debug) cout <<"symbol["<<_idx<<"]=["<<symb<<"]"<<endl;
00344       return true;
00345     }
00346 
00347   }
00348   return false;
00349 }
00350 bool ReadMemory::lecture(bool init=false){
00351   short int tmpS=-1;
00352   string oov;
00353   if ((!init) && (_idx<_realIdx)){
00354     _idx++; 
00355     return true;
00356   }
00357   if ((init)&&(_segmental))
00358     while (_idx<_begin){
00359       if (readSymb(tmpS,oov)){
00360         _idx++;
00361         _realIdx++;
00362         if (tmpS==-1) cerr << "Warning, oov["<<oov<<"] detected and ignored, idx["<<_realIdx<<"]"<<endl;
00363       } else return false;
00364     }
00365   
00366   if (notEof()){
00367     if (!init) {_realIdx++;_idx++;}
00368     while (readSymb(tmpS,oov)){
00369       if (tmpS==-1) cerr << "Warning, oov["<<oov<<"] detected and ignored, idx["<<_realIdx<<"]"<<endl;
00370       else {_buf[_realIdx%_bufSize]=tmpS;return true ;}
00371       _realIdx++;_idx++;
00372     }
00373   }
00374   return false;
00375 }
00376 unsigned long ReadMemory::getIdx(){
00377   return _idx;
00378 }
00379 short int ReadMemory::getCurrentSymb(){
00380   return _buf[_idx%_bufSize];
00381 }
00382 void ReadMemory::setIdx(unsigned long idx){
00383   _idx=idx;
00384 }
00385 
00386 void Seq::_reserveMem(){
00387   _array=new SymbTab[_maxLength]; 
00388   for (int i=0;i<_maxLength;i++){  
00389     _array[i].setNbSymb(_nbInputSymb);
00390     _array[i].reserveMem();
00391   }
00392 }
00393 Seq::Seq(const Seq& obj){
00394   _maxLength=obj._maxLength;
00395   _length=obj._length;
00396   _nbInputSymb=obj._nbInputSymb;   
00397   _reserveMem();
00398   for (int i=0;i<_maxLength;i++) _array[i]=obj._array[i];   
00399 }
00400 Seq::Seq(short int maxLength, short int nbInputSymb){
00401   _maxLength=maxLength;
00402   _nbInputSymb=nbInputSymb;
00403   _length=0;
00404   _reserveMem();    
00405 }
00406 Seq::~Seq(){
00407   delete [] _array;
00408 }
00409 Seq & Seq::operator=(const Seq& obj){
00410   if (this !=(&obj)){
00411     delete [] _array;
00412     _maxLength=obj._maxLength;
00413     _nbInputSymb=obj._nbInputSymb;   
00414     _length=obj._length;
00415     _reserveMem();
00416     for (int i=0;i<_maxLength;i++) _array[i]=obj._array[i];   
00417   }
00418   return *(this);
00419 }
00420 SymbTab& Seq::operator[](short int idx){
00421   if (idx>=_maxLength) throw Exception("index > sequence length"
00422                                       , __FILE__, __LINE__);
00423   return _array[idx];
00424 }
00425 void Seq::setLength(short int length){
00426   if (length>_maxLength) throw Exception("index > sequence length", __FILE__, __LINE__);
00427   _length=length;
00428 }
00429 short int Seq::getLength(){
00430   return _length;
00431 }
00432 void Seq::init(short int length=0){
00433   if ((length>_maxLength)|| (length<0)) throw Exception("index > max sequence length" , __FILE__, __LINE__);
00434   for (int i=length;i<_maxLength;i++)
00435     _array[i].init(false);
00436   _length=length;
00437 }
00438 void Seq::show(){
00439   cout <<"sequence"<<endl;
00440   for (int i=0;i<_length;i++){
00441     cout <<"["<<i<<"]";_array[i].show();}
00442 }
00443 // Add a symbole at the end of the sequence
00444 void Seq:: add(SymbTab *ptr){   
00445   if (_length==_maxLength) throw Exception("out of bound" , __FILE__, __LINE__);
00446   _array[_length]=(*ptr);
00447   _length++;
00448 }
00449 
00450 
00451 // SEQUENCE DECODER
00452 SequenceDecoder::SequenceDecoder(short int nbInputSymb){          
00453   _nbOutputSeq=0;
00454   _nbOutputSeqPart=0;
00455   _nbInputSymb=nbInputSymb;
00456   _seed=NULL;
00457 }
00458 SequenceDecoder:: SequenceDecoder(const SequenceDecoder &obj){
00459 throw Exception("recopy constructor not available"
00460         , __FILE__, __LINE__);
00461 }
00462 SequenceDecoder SequenceDecoder:: operator=(const SequenceDecoder & obj){
00463 throw Exception("= constructor not available"
00464         , __FILE__, __LINE__);
00465 }
00466 SequenceDecoder::~SequenceDecoder(){
00467   _freeTree(_seed);
00468 }
00469 void SequenceDecoder::_freeTree(SequenceDecoderNode *seed){
00470   if (seed==NULL) return;
00471   _freeTree(seed->br);
00472   _freeTree(seed->ch);
00473   delete seed->symbTab;
00474   delete seed;
00475 }
00476 SequenceDecoderNode* SequenceDecoder::_newNode(const SymbTab &symbTab,const short int outputSymb,
00477                                               SequenceDecoderNode *ch,SequenceDecoderNode *br){
00478   SequenceDecoderNode *ptr=new SequenceDecoderNode;
00479   ptr->symbTab=new SymbTab(symbTab);
00480   ptr->outputSymb=outputSymb;
00481   ptr->ch=ch;
00482   ptr->br=br;
00483   return ptr;
00484 }
00485 SequenceDecoderNode* SequenceDecoder::_findInsert(const SymbTab &symbTab,SequenceDecoderNode *ptr){
00486   if (ptr==NULL) return _newNode(symbTab,-1,NULL,NULL); // We arrived on a leaf
00487   while ((ptr->br!=NULL) && (*(ptr->symbTab)!=symbTab)) ptr=ptr->br;
00488   if (*(ptr->symbTab)==symbTab) return ptr;
00489   else{
00490     ptr->br=_newNode(symbTab,-1,NULL,NULL);
00491     return ptr->br;
00492   }
00493 }
00494 
00495 void SequenceDecoder::addSequence(Seq &sequence, const short int outputSymb){
00496   if (sequence.getLength()==0) throw Exception(" Try to add a null length sequence" , __FILE__, __LINE__); 
00497   SequenceDecoderNode *currentP=_findInsert(sequence[0],_seed);
00498   if (_seed==NULL) _seed=currentP;
00499   for (int step=1;step<sequence.getLength();step++){  // For each of the other steps in the sequence
00500     SequenceDecoderNode* tmpP=_findInsert(sequence[step],currentP->ch);
00501     if (currentP->ch==NULL)currentP->ch=tmpP;
00502     currentP=tmpP;
00503   } 
00504   if (currentP->outputSymb != -1) throw Exception("Try to add a sequence on an existing one" , __FILE__, __LINE__); 
00505   currentP->outputSymb=outputSymb;
00506   _nbOutputSeqPart++;
00507 }
00508 void SequenceDecoder::show(){
00509   _show(_seed,0);
00510 }
00511 void SequenceDecoder::_show(SequenceDecoderNode *ptr,short int order){
00512   if (ptr==NULL) return;
00513   cout <<"Order["<<order<<"]"<<endl;
00514   (*ptr->symbTab).show();
00515   if (ptr->outputSymb!=-1) cout<<"outputSymb["<<ptr->outputSymb<<"]"<<endl;
00516   _show(ptr->ch,order+1);
00517   _show(ptr->br,order);
00518 }
00519 
00520 void SequenceDecoder::_toFile(SequenceDecoderNode *ptr,short int symb,Seq & currentSeq,ostream & outputFile){
00521   if (ptr==NULL)return;
00522   currentSeq.add(ptr->symbTab);
00523   if (ptr->outputSymb==symb){ // We find a sequence corresponding to the current output symbol
00524     outputFile << "Symb["<<symb<<"]=";
00525     for (int i=0; i<currentSeq.getLength();i++){
00526       outputFile<<"{";
00527       for (int s=0;s<currentSeq[i].getNbSymb();s++)
00528         if (currentSeq[i].isIn(s)) outputFile <<s<<" ";
00529       outputFile<<"}";
00530     }
00531     outputFile <<endl;
00532   }
00533   _toFile(ptr->ch,symb,currentSeq,outputFile);
00534   currentSeq.init(currentSeq.getLength()-1);
00535   _toFile(ptr->br,symb,currentSeq,outputFile);
00536 }
00537 void SequenceDecoder::toFile(ostream & outputFile){
00538   for(unsigned long int i=0; i<_nbOutputSeq;i++){
00539     Seq currentSeq(100,_nbInputSymb); 
00540     currentSeq.init(0); 
00541     _toFile(_seed,i,currentSeq,outputFile);
00542   }
00543 }
00544 SequenceDecoderNode * SequenceDecoder::_load(istream &inputFile){
00545   string tmp;
00546   if (inputFile.eof()) return NULL;
00547   inputFile>>tmp;
00548   if (tmp=="nil") return NULL;
00549   SequenceDecoderNode *ptr=NULL;
00550   SequenceDecoderNode* retPtr=NULL;
00551   bool init=true;
00552   if (tmp=="begin"){
00553     while (tmp!="nil"){
00554       SymbTab sym(_nbInputSymb);
00555       SequenceDecoderNode * newPtr=_newNode(sym,-1,NULL,NULL);
00556       newPtr->ch=_load(inputFile);
00557       inputFile>>newPtr->outputSymb;
00558       short int s;
00559       do{
00560         inputFile>>s;
00561         if (s!=-1)newPtr->symbTab->setSymb(s);
00562       } while (s!=-1);
00563       if (init) {retPtr=newPtr;ptr=newPtr;init=false;}
00564       else{ptr->br=newPtr;ptr=ptr->br;}
00565       inputFile>>tmp;
00566     }
00567     ptr->br=NULL;
00568     return retPtr;
00569   }
00570   else throw Exception("nil or eof is missing" , __FILE__, __LINE__); 
00571 }
00572 void SequenceDecoder::load(istream &inputFile){
00573   inputFile>>_nbInputSymb;
00574   inputFile>>_nbOutputSeqPart;
00575   inputFile>> _nbOutputSeq;
00576   _seed=_load(inputFile);
00577 }
00578 void SequenceDecoder::_save(SequenceDecoderNode *ptr,ostream & outputFile){
00579   if (ptr==NULL) outputFile<<"nil"<<endl;
00580   else{
00581     while (ptr!=NULL){
00582       outputFile<<"begin"<<endl;
00583       _save(ptr->ch,outputFile);
00584       outputFile<<ptr->outputSymb<<" ";
00585       for (int s=0;s<ptr->symbTab->getNbSymb();s++)
00586         if (ptr->symbTab->isIn(s)) outputFile<<s<<" ";
00587       outputFile<<"-1"<<endl;
00588       ptr=ptr->br;
00589     }
00590     outputFile<<"nil"<<endl;
00591   }
00592 }
00593 void SequenceDecoder::save(ostream & outputFile){
00594   outputFile<<_nbInputSymb<<endl;
00595   outputFile<<_nbOutputSeqPart<<endl;;
00596   outputFile<< _nbOutputSeq<<endl;
00597   _save(_seed,outputFile);
00598 }
00599 // _decode() finds one sequence. currentIdx is on the first symbol to decode. At the end, the input stream is on the next symbol to decode
00600 bool SequenceDecoder::_decode(SequenceDecoderNode *ptr,
00601                               ReadMemory & inp,ostream & outputFile,unsigned long &begin,short int &decodedSeq){
00602   if (debug) cout << "_decode["<<inp.getIdx()<<"]=["<<inp.getCurrentSymb()<<"]"<<endl;
00603   while ((ptr!=NULL)&&(!ptr->symbTab->isIn(inp.getCurrentSymb()))) ptr=ptr->br;
00604   if (ptr==NULL) return false;
00605   
00606   if (ptr->ch==NULL){ //end of the sequence
00607     if (debug) cout <<"end of sequence detected idx["<<inp.getIdx()<<"]"<<endl;
00608     if (ptr->outputSymb==-1) cerr<< "Warning: not ended sequence at idx["<<inp.getIdx()<<"]"<<endl;
00609     else {outputFile <<begin<<" "<<inp.getIdx()<<" "<<ptr->outputSymb<<endl;decodedSeq=ptr->outputSymb;};
00610     inp.lecture();
00611     begin=inp.getIdx();
00612     return true;
00613   }
00614   
00615   if (ptr->outputSymb==-1){ // We are in the sequence but no output at this level: go at the next level
00616     if (debug) cout <<"no output at this level idx["<<inp.getIdx()<<"], go to the next level"<<endl;
00617     if (!inp.lecture()) return false;
00618     return _decode(ptr->ch,inp,outputFile,begin,decodedSeq);
00619   }
00620   // If eof, we output the current solution
00621   if (inp.eof()) {
00622     outputFile <<begin<<" "<<inp.getIdx()<<" "<<ptr->outputSymb<<endl;decodedSeq=ptr->outputSymb;
00623     //  inp.lecture();
00624     // begin=inp.getIdx();
00625     return true;
00626   }
00627   // Now, we have at least one solution, maybe several..., we have to select the longuest one
00628   unsigned long idx=inp.getIdx();
00629   if (debug) cout <<"save idx["<<idx<<"]symb="<<inp.getCurrentSymb()<<endl;
00630   inp.lecture();
00631   if (inp.eof() || (!_decode(ptr->ch,inp,outputFile,begin,decodedSeq))){   // We didn't find a more longer sequence
00632     outputFile <<begin<<" "<<idx<<" "<<ptr->outputSymb<<endl;decodedSeq=ptr->outputSymb;
00633     inp.setIdx(idx);                             // Reseting the input stream at the last good value
00634     if (debug) cout << "comeback to idx["<<idx<<"]symb="<<inp.getCurrentSymb()<<endl;
00635     inp.lecture();
00636     begin=inp.getIdx();
00637   }
00638   return true; 
00639 }
00640 
00641 void SequenceDecoder::decode(String inputFilename,ostream & outputFile,unsigned long segBegin,unsigned long segLength,
00642                              bool overlapMode,bool ngramMode,BNgram &ngram) {
00643   ReadMemory inp(inputFilename,100,segBegin,segLength); // 100 is the buffer size, should be >= the max sequence length. 
00644                                                         // if length==0 -> segmental mode not used
00645   if (verbose) cout <<"Begin decoding for seg["<<segBegin<<","<<segLength<<"] overlap:"<<overlapMode<<endl;
00646   if (!inp.lecture(true)) return;                       // true is used only for the first call, false by default
00647   unsigned long begin=inp.getIdx();
00648   bool flag=true;
00649   while (inp.notEof()){
00650     unsigned long saveIdx=inp.getIdx();
00651     short int seq;            // decoded sequence
00652     bool ret=_decode(_seed,inp,outputFile,begin,seq);
00653     if (!ret){ 
00654       cout <<"WARNING, Seq unknown beginning by symb["<<inp.getCurrentSymb()<<"]idx["<<inp.getIdx()<<"]"<<endl;
00655       if (!inp.lecture()) return;
00656       begin=inp.getIdx();
00657     }
00658     if (ngramMode) if (flag){ ngram.beginSeq(seq); flag=false;} else ngram.addSymb(seq);
00659     if (overlapMode){ // We want sequences with a max overlap i.e. a sequence is researched for each input symbol
00660       inp.setIdx(saveIdx);
00661       if (verbose) cout <<"overlapMode, comeback to symb idx:"<<saveIdx<<endl;
00662       if (!inp.lecture()) return;
00663       begin=inp.getIdx();
00664     }
00665     if (verbose) cout <<"decode new sequence at idx["<<inp.getIdx()<<"]"<<endl;
00666   }  
00667   ngram.endSeq();
00668 }
00669 
00670 
00671 //*************************************************************************************************************************
00672 //*****           Main functions (interface)
00673 //**************************************************************************************************************************
00674 
00675 // sequenceDecoder - Using a sequence decoder tree, this function reads a sequence of input symbols and transcodes it, 
00676 // producing a list of labels begin, end (in frames) sequence number
00677 int sequenceDecoder(Config &config){
00678   String inputTreeFilename=config.getParam("decoderFilename");  // The complete filename for the decoder tree
00679   String outputFilename=config.getParam("outputFilename");      // The complete output filename
00680   String inputFilename=config.getParam("inputFilename");        // The complete input filename 
00681   String labelSelectedFrames;
00682   String labelFilename;
00683   bool label=false;
00684   if (config.existsParam("labelFilename")){
00685     labelFilename=config.getParam("labelFilename");
00686     labelSelectedFrames=config.getParam("labelSelectedFrames");              // label for selected frames - Only the frames from segments 
00687     label=true;
00688   }
00689   bool overlapMode=false;
00690   if (config.existsParam("overlapMode")) overlapMode=config.getParam("overlapMode").toBool();
00691   bool ngramMode=false;
00692   String nOutputF;
00693   unsigned long oNgramOrder=0;
00694   if (config.existsParam("ngramOutputFilename")){
00695     ngramMode=true;nOutputF=config.getParam("ngramOutputFilename");
00696     oNgramOrder=config.getParam("ngramOutputOrder").toLong();
00697   }
00698   BNgram outNgram(oNgramOrder,100);
00699   SequenceDecoder decTree(0);
00700   ifstream inputTreeFile(inputTreeFilename.c_str());
00701   decTree.load(inputTreeFile);
00702   ofstream outputFile(outputFilename.c_str(),ios::out| ios::trunc);
00703   if (label){
00704     SegServer segmentsServer;                                     // Create the segment server for managing the segments/clusters
00705     LabelServer labelServer;                                      // Create the label server for managing the segment/cluster names
00706     loadClusterFile(labelFilename,segmentsServer,labelServer,config); // load the label file
00707     long codeSelectedFrame=labelServer.getLabelIndexByString(labelSelectedFrames);   // Get the index of the cluster with in interest audio segments
00708     if (codeSelectedFrame==-1) throw Exception("No segment with ["+labelSelectedFrames+"] label" , __FILE__, __LINE__);
00709     SegCluster& selectedSegments=segmentsServer.getCluster(codeSelectedFrame); // Gives the cluster of the selected/used segments   
00710     Seg* seg;                                                     // reset the reader at the begin of the input stream
00711     selectedSegments.rewind(); 
00712     while((seg=selectedSegments.getSeg())!=NULL){                 // For each of the selected segments
00713       decTree.decode(inputFilename,outputFile,seg->begin(),seg->length(),overlapMode,ngramMode,outNgram); // decode a given segment
00714     }
00715   }
00716   else decTree.decode(inputFilename,outputFile,0,0,overlapMode,ngramMode,outNgram); // all the frames of the file
00717   if (ngramMode){
00718     ofstream outputFileNgram(nOutputF.c_str(),ios::out| ios::trunc);
00719     outNgram.show(outputFileNgram);
00720   }
00721   
00722   return 0;
00723 }
00724 // sequence extractor builds a common part tree defining a set of variable length symbols with the as same 
00725 // as possible probabilities to occur.
00726 // the input data are the grams, from 1 to maxOrder. Each contains the observed ngram with the corresponding probability
00727 // the required number of symbols (nbSym) should also be defined a priori. It gives pSym=1/nbSym, the target probability for each symbol
00728 // The algo consists in selecting the maximum length sequence (S0,S1,..,Sn) with the maximum probability and to aglomerate it with common 
00729 // part sequences until pSym is reached. I.E, making a symbol by combining (a,b,c,d) with (a,b,c,e) (a,b,c,f)...
00730 // The variable length symbol is then inserted in a common part tree: the result !
00731 // the algo is repeated until the number of symbol is reached
00732 int sequenceExtractor(Config& config){
00733 
00734   int maxOrder=config.getParam("maxOrder").toLong();              // Getting the order of ngrams. Ngrams from 1 to maxOrder should be present
00735   unsigned long maxNgram=config.getParam("maxNgram").toLong();    // Max ngram by file
00736   int nbSymb=config.getParam("nbInputSymb").toLong();             // Number of input symbols
00737   const int nbSeq=config.getParam("nbOutputSymb").toLong();       // Number of sequence to detect
00738   bool equalInputInfo=false;
00739   if (config.existsParam("equalInputInfo")) equalInputInfo=config.getParam("equalInputInfo").toBool(); // Work with count*size of the sequence
00740   String ngramFilename=config.getParam("ngramFilename");          // Base filename for the ngram. No order filename have the form ngramFilenameN.ext 
00741   String ngramExt=config.getParam("ngramExt");                    // Ext for the ngram files, should include the .
00742   String outputFilenameInfo=config.getParam("outputInfoFilename");        // The complete output filename for info file
00743   ofstream outputFileInfo(outputFilenameInfo.c_str(),ios::out| ios::trunc);
00744   String outputFilename=config.getParam("outputFilename");        // The complete output filename for the decoder tree
00745   ofstream outputFile(outputFilename.c_str(),ios::out| ios::trunc);
00746  
00747   CommonPartTree initialTree;                             // The tree with all the info  
00748   if (verbose) cout << "reading the ngram files"<<endl;
00749   for (int order=1;order<=maxOrder;order++){              // Load the ngrams, put it in a common part tree
00750     NGram nGram(order,maxNgram);
00751     nGram.load(ngramFilename+String::valueOf(order)+ngramExt,config);
00752     if (debug) nGram.showTable();
00753     initialTree.addNGram(nGram);                        // insert all the ngram in the tree 
00754   }
00755   initialTree.setTotalCount(initialTree.getTotalChildCount());  // All the symbols are seen in the unigrams
00756   if (verbose) cout <<"total input symbols:"<<initialTree.getTotalChildCount()<<endl;
00757   if (debug) {cout<<"initial tree"<<endl; initialTree.show();}
00758 
00759   // Initialize the tree 
00760   SequenceDecoder seqTree(nbSymb);
00761   Seq currentSeq(maxOrder,nbSymb); 
00762   // Begin the main loop
00763   unsigned long remainingInputSymb=initialTree.getTotalChildCount();
00764   bool fEnd=false;
00765   for (int seq=0;(seq<nbSeq) && (!fEnd);seq++){ // Main sequence loop - a sequence is decided at each iteration
00766     if (verbose) cout << "begin of the main sequence detection loop, seq:"<<seq<<endl;
00767     unsigned long targetSeqCount=remainingInputSymb/(nbSeq-seq);                  // Count for each variable length symbol
00768     if (verbose) cout << "totalchildcount"<<initialTree.getTotalChildCount()<<" Remaining input symb:"<<remainingInputSymb<<endl;
00769     if (verbose) cout <<"target Seq count["<<targetSeqCount<<"]"<<endl;
00770     currentSeq.init(); // reset the current seq
00771     unsigned long seqCount=initialTree.findMaxSeq(currentSeq); // Finding the maximum length sequence with the max count 
00772     if (equalInputInfo) seqCount*=currentSeq.getLength();      // If you want to take into account the number of input symbols and not only the count
00773     if (currentSeq.getLength()==0) fEnd=true;
00774     else{ 
00775       if (verbose){
00776         cout <<"First Seq["<<seq<<"] Length["<<currentSeq.getLength();
00777         if (equalInputInfo)cout<<"] Seq input symbols count[";
00778         else cout <<"] SeqCount[";
00779         cout<<seqCount<<"]"<<endl;currentSeq.show();
00780       }
00781       initialTree.suppressSeq(currentSeq); // suppress the sequence in the initialTree
00782       if (debug){ cout <<"tree after suppress"<<endl; initialTree.show();}
00783       seqTree.addSequence(currentSeq,seq);
00784       if (debug) {cout <<"seq tree after adding initial sequence"<<endl;seqTree.show();}
00785       short int length=currentSeq.getLength()-1;// -1 because we want to suppress the last level
00786       while ((seqCount<targetSeqCount)&&(length>=0)){
00787         bool end=false;
00788         while ((!end)&&(length>=0)&&(seqCount<targetSeqCount)){
00789           currentSeq.init(length);                             // withdrath the info of the end of the sequence
00790           if (debug) {cout <<"current path"<<endl;currentSeq.show();}
00791           unsigned long deltaSeqCount=initialTree.findMaxEndSeq(currentSeq);
00792           if (equalInputInfo) deltaSeqCount*=currentSeq.getLength();                       // If you want to take into account the number of input symbols and not only the count
00793           end=(deltaSeqCount==0)||(currentSeq.getLength()==0);
00794           if (!end){
00795             seqCount+=deltaSeqCount;
00796             if (verbose){
00797               cout <<"Seq["<<seq<<"] add new seq Length["<<currentSeq.getLength();
00798               if (equalInputInfo)cout<<"] Seq input symbols count[";
00799               else cout <<"] SeqCount[";
00800               cout<<seqCount<<"]"<<endl;currentSeq.show();
00801             }
00802             initialTree.suppressSeq(currentSeq); // suppress the sequence in the initialTree
00803             if (debug){ cout <<"tree after suppress seq"<<endl; initialTree.show();}
00804             seqTree.addSequence(currentSeq,seq);
00805             if (debug){cout <<"seq tree after adding seq"<<endl;seqTree.show();}
00806             length=currentSeq.getLength()-1;
00807           }
00808           else length--;
00809         }
00810       }
00811       remainingInputSymb-=seqCount;
00812       if (seqCount==0) fEnd=true;
00813       else{
00814         outputFileInfo<<"Sequence["<<seq<<"] Count["<<seqCount<<"]"<<endl;
00815         seqTree.setNbOutputSeq(seq+1);
00816       }
00817     }
00818   } 
00819   
00820   if (verboseLevel>1){cout <<"FINAL SEQUENCE TREE"<<endl; seqTree.show();}
00821   if (verbose) seqTree.toFile(outputFileInfo);
00822   seqTree.save(outputFile);
00823  
00824   return 0;
00825 }
00826 
00827 #endif