00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055 #if !defined(ALIZE_SequenceExtractor_cpp)
00056 #define ALIZE_SequenceExtractor_cpp
00057
00058 #include <iostream>
00059 #include <fstream>
00060 #include <cstdio>
00061 #include <cassert>
00062 #include <cmath>
00063 #include <string>
00064
00065 #include "liatools.h"
00066 #include "BNGram.h"
00067 #include "SequenceExtractor.h"
00068
00069
00070
00071
00072
00073
00074
00075 CommonPartTree::CommonPartTree()
00076 {
00077 _totalCount=0;
00078 _totalChildCount=0;
00079 _seed=NULL;
00080 }
00081 CommonPartTree:: CommonPartTree(const CommonPartTree &obj){
00082 throw Exception("recopy constructor not available"
00083 , __FILE__, __LINE__);
00084 }
00085 CommonPartTree CommonPartTree:: operator=(const CommonPartTree & obj){
00086 throw Exception("= constructor not available"
00087 , __FILE__, __LINE__);
00088 }
00089
00090 CommonPartTree::~CommonPartTree(){
00091 _freeTree(_seed);
00092 }
00093 void CommonPartTree::_freeTree(CommonPartTreeNode *ptr){
00094 if (ptr==NULL) return;
00095 _freeTree(ptr->br);
00096 _freeTree(ptr->ch);
00097 delete ptr;
00098 }
00099 CommonPartTreeNode* CommonPartTree::_newNode(const short int symb,const unsigned long count,
00100 CommonPartTreeNode *ch,CommonPartTreeNode *br){
00101 CommonPartTreeNode *ptr=new CommonPartTreeNode;
00102 ptr->symb=symb;
00103 ptr->count=count;
00104 ptr->totalChildCount=0;
00105 ptr->ch=ch;
00106 ptr->br=br;
00107 return ptr;
00108 }
00109
00110 CommonPartTreeNode* CommonPartTree::_findInsert(const short int symb,unsigned long count, CommonPartTreeNode * ptr){
00111 if (ptr==NULL)
00112 return _newNode(symb,count,NULL,NULL);
00113 if (ptr->symb!=symb) {
00114 CommonPartTreeNode *pTmp=_findInsert(symb,count,ptr->br);
00115 if (ptr->br==NULL) ptr->br=pTmp;
00116 return pTmp;
00117 }
00118 else return ptr;
00119 }
00120
00121 void CommonPartTree::addNGram(NGram & nGram){
00122 for (unsigned long idx=0; idx<nGram.getSize();idx++){
00123 unsigned long count=nGram.getCount(idx);
00124 CommonPartTreeNode *currentP=_findInsert(nGram.getSymbol(idx,0),count,_seed);
00125 if (_seed==NULL) _seed=currentP;
00126 CommonPartTreeNode *tmpP=NULL;
00127 for (unsigned long order=1;order<nGram.getOrder();order++){
00128 tmpP=currentP;
00129 currentP=_findInsert(nGram.getSymbol(idx,order),count,currentP->ch);
00130 if (tmpP->ch==NULL) tmpP->ch=currentP;
00131 }
00132 if (nGram.getOrder()==1) _totalChildCount+=count;
00133 else tmpP->totalChildCount+=count;
00134 }
00135 }
00136
00137
00138
00139 CommonPartTreeNode * CommonPartTree::_findPartSeq(Seq &seq,short int order,CommonPartTreeNode *ptr){
00140 if (ptr==NULL) return NULL;
00141 if (seq.getLength()==0) return _seed;
00142 if (order>=seq.getLength()) return NULL;
00143 if (seq[order].isIn(ptr->symb)){
00144 if (order==seq.getLength()-1) return ptr;
00145 else return _findPartSeq(seq,order+1,ptr->ch);
00146 }
00147 else return _findPartSeq(seq,order,ptr->br);
00148 }
00149
00150
00151
00152
00153
00154 unsigned long CommonPartTree::_findMaxSeq(CommonPartTreeNode *ptr,Seq &seq, short int & order){
00155 if (ptr==NULL){ order=0; return 0;}
00156 short int orderCh=order+1;
00157 short int orderBr=order;
00158 Seq seqBr=seq;
00159 seq.setLength(order+1);seq[order].setSymb(ptr->symb);
00160 unsigned long brCount=_findMaxSeq(ptr->br,seqBr,orderBr);
00161 unsigned long chCount=_findMaxSeq(ptr->ch,seq,orderCh);
00162
00163 if (orderBr<orderCh){seq[order].setSymb(ptr->symb); order=orderCh;return chCount;}
00164
00165 if (orderCh==0){
00166 if (orderBr==0) return ptr->count;
00167 if (orderBr>order) {order=orderBr; seq=seqBr;return brCount;}
00168
00169 if (ptr->count>brCount) return ptr->count;
00170 else {order=orderBr; seq=seqBr;return brCount;}
00171 }
00172
00173 if (orderBr>orderCh){order=orderBr; seq=seqBr;return brCount;}
00174
00175 if (brCount>(chCount)){order=orderBr;seq=seqBr;return brCount;}
00176 else{seq[order].setSymb(ptr->symb);order=orderCh;return chCount;}
00177 }
00178 unsigned long CommonPartTree::findMaxSeq(Seq &seq){
00179 short int order=0;
00180 if (_seed!=NULL){
00181 return _findMaxSeq(_seed,seq,order);
00182 }
00183 return 0;
00184 }
00185 unsigned long CommonPartTree::findMaxEndSeq(Seq &seq){
00186 if (seq.getLength()==0) return findMaxSeq(seq);
00187 if (_seed!=NULL){
00188 CommonPartTreeNode * ptr=_findPartSeq(seq,0,_seed);
00189 if (ptr==NULL) return 0;
00190 short int order=seq.getLength();
00191 if (ptr->ch != NULL) return _findMaxSeq(ptr->ch,seq,order);
00192 else return ptr->count;
00193 }
00194 return 0;
00195 }
00196
00197
00198
00199 CommonPartTreeNode* CommonPartTree::_suppressSeq(CommonPartTreeNode * ptr,Seq &seq,
00200 short int order, unsigned long &childCountDelta){
00201 if (order>=seq.getLength())
00202 throw Exception("sequence longer than the tree" , __FILE__, __LINE__);
00203 CommonPartTreeNode* ptrRet=ptr;
00204 CommonPartTreeNode* ptrTmp=ptr;
00205 while ((ptr!=NULL)&& (!seq[order].isIn(ptr->symb))) {ptrTmp=ptr;ptr=ptr->br;}
00206 if (seq[order].isIn(ptr->symb)){
00207 if (order==seq.getLength()-1) {
00208 childCountDelta=ptr->count;
00209 ptrTmp->br=ptr->br;
00210 if (ptrRet==ptr) ptrRet=ptr->br;
00211 delete ptr;
00212 }
00213 else{
00214 ptr->ch=_suppressSeq(ptr->ch,seq,order+1,childCountDelta);
00215 if (ptr->count<childCountDelta) throw Exception("count problem in the tree, childcount < delta" , __FILE__, __LINE__);
00216 ptr->totalChildCount-=childCountDelta;
00217 ptr->count-=childCountDelta;
00218 if (ptr->count==0) {
00219 ptrTmp->br=ptr->br;
00220 if (ptrRet==ptr) ptrRet=ptr->br;
00221 }
00222 }
00223 }
00224 else{
00225 childCountDelta=0;
00226 ptrRet= NULL;
00227 }
00228 return ptrRet;
00229 }
00230
00231 void CommonPartTree::suppressSeq(Seq &seq){
00232 short int order=0;
00233 if (seq.getLength()==0) return;
00234 unsigned long childCountDelta=0;
00235 _seed=_suppressSeq(_seed,seq,order,childCountDelta);
00236 _totalChildCount-=childCountDelta;
00237 }
00238 void CommonPartTree::_show(CommonPartTreeNode *ptr,unsigned long order){
00239 if (ptr==NULL) return;
00240 cout <<"Order["<<order<<"] Symbol["<<ptr->symb<<"] count["<<ptr->count<<"]Childcount["<<ptr->totalChildCount<<"]"<<endl;
00241 _show(ptr->ch,order+1);
00242 _show(ptr->br,order);
00243 }
00244 void CommonPartTree::show(){
00245 cout <<"TotalCount["<<_totalCount<<"]totalChildCount["<<_totalChildCount<<"]"<<endl;
00246 _show(_seed,0);
00247 }
00248
00249
00250
00251
00252
00253
00254 void SymbTab::reserveMem(){
00255 _symbTab= new bool[_nbSymb];
00256 }
00257 SymbTab:: SymbTab(const SymbTab &obj){
00258 _nbSymb=obj._nbSymb;
00259 reserveMem();
00260 for (int i=0;i<_nbSymb;i++) _symbTab[i]=obj._symbTab[i];
00261 }
00262 SymbTab& SymbTab:: operator=(const SymbTab &obj){
00263 if (this!=(&obj)){
00264 if (_symbTab!=NULL) delete [] _symbTab;
00265 _nbSymb=obj._nbSymb;
00266 reserveMem();
00267 for (int i=0;i<_nbSymb;i++) _symbTab[i]=obj._symbTab[i];
00268 }
00269 return (*this);
00270 }
00271 bool SymbTab:: operator==(const SymbTab & obj){
00272 if ((_symbTab==NULL)|| (obj._symbTab==NULL)) throw Exception("symbol array not allowed" , __FILE__, __LINE__);
00273 if (_nbSymb!=obj._nbSymb) throw Exception("symbol arrays differ in size" , __FILE__, __LINE__);
00274 for (int i=0;i<_nbSymb;i++) if (_symbTab[i]!=obj._symbTab[i]) return false;
00275 return true;
00276 }
00277 bool SymbTab:: operator!=(const SymbTab & obj){
00278 if ((_symbTab==NULL)|| (obj._symbTab==NULL)) throw Exception("symbol array not allowed" , __FILE__, __LINE__);
00279 if (_nbSymb!=obj._nbSymb) throw Exception("symbol arrays differ in size" , __FILE__, __LINE__);
00280 for (int i=0;i<_nbSymb;i++) if (_symbTab[i]!=obj._symbTab[i]) return true;
00281 return false;
00282 }
00283 bool SymbTab::isIn(short int symb){
00284 if (_symbTab==NULL) throw Exception("symbol array not allowed" , __FILE__, __LINE__);
00285 if ((symb<0)||(symb>=_nbSymb)){
00286 cout << "symb=["<<symb<<"]"<<endl;
00287 throw Exception("symbol out of boundaries" , __FILE__, __LINE__);
00288 }
00289 return _symbTab[symb];
00290 }
00291 void SymbTab::setSymb(short int symb){
00292 if ((symb<0) || (symb>_nbSymb))throw Exception("index out of bound"
00293 , __FILE__, __LINE__);
00294 _symbTab[symb]=true;
00295 }
00296 void SymbTab::show(){
00297 cout <<"[";
00298 for (int i=0;i<_nbSymb;i++)
00299 if (_symbTab[i]) cout <<i<<" ";
00300 cout <<"]"<<endl;
00301 }
00302
00303
00304
00305
00306
00307
00308
00309
00310 ReadMemory::ReadMemory(String inputFilename,int bufSize=100,unsigned long begin=0,unsigned long length=0){
00311 _inputFile.open(inputFilename.c_str());
00312 _bufSize=bufSize;
00313 _buf=new short int[_bufSize];
00314 _idx=0;
00315 _realIdx=0;
00316 _begin=begin;
00317 _length=length;
00318 _segmental=(length>0);
00319 }
00320 ReadMemory::~ReadMemory(){
00321 delete [] _buf;
00322 _inputFile.close();
00323 }
00324 bool ReadMemory::notEof(){
00325 if (!_segmental) return (!_inputFile.eof());
00326 else return ((_idx<_begin+_length)&&(!_inputFile.eof()));
00327 }
00328 bool ReadMemory::eof(){
00329 return !(notEof());
00330 }
00331 bool ReadMemory::readSymb(short int &symb,string & oov){
00332 if (notEof()){
00333 char c;
00334 _inputFile>>c;
00335 while (notEof() && (c<=32)) _inputFile>>c;
00336 if (notEof()){
00337 _inputFile.putback(c);
00338 if ((c >= '0') && (c <= '9')) _inputFile >> symb;
00339 else{
00340 _inputFile>>oov;
00341 symb=-1;
00342 }
00343 if (debug) cout <<"symbol["<<_idx<<"]=["<<symb<<"]"<<endl;
00344 return true;
00345 }
00346
00347 }
00348 return false;
00349 }
00350 bool ReadMemory::lecture(bool init=false){
00351 short int tmpS=-1;
00352 string oov;
00353 if ((!init) && (_idx<_realIdx)){
00354 _idx++;
00355 return true;
00356 }
00357 if ((init)&&(_segmental))
00358 while (_idx<_begin){
00359 if (readSymb(tmpS,oov)){
00360 _idx++;
00361 _realIdx++;
00362 if (tmpS==-1) cerr << "Warning, oov["<<oov<<"] detected and ignored, idx["<<_realIdx<<"]"<<endl;
00363 } else return false;
00364 }
00365
00366 if (notEof()){
00367 if (!init) {_realIdx++;_idx++;}
00368 while (readSymb(tmpS,oov)){
00369 if (tmpS==-1) cerr << "Warning, oov["<<oov<<"] detected and ignored, idx["<<_realIdx<<"]"<<endl;
00370 else {_buf[_realIdx%_bufSize]=tmpS;return true ;}
00371 _realIdx++;_idx++;
00372 }
00373 }
00374 return false;
00375 }
00376 unsigned long ReadMemory::getIdx(){
00377 return _idx;
00378 }
00379 short int ReadMemory::getCurrentSymb(){
00380 return _buf[_idx%_bufSize];
00381 }
00382 void ReadMemory::setIdx(unsigned long idx){
00383 _idx=idx;
00384 }
00385
00386 void Seq::_reserveMem(){
00387 _array=new SymbTab[_maxLength];
00388 for (int i=0;i<_maxLength;i++){
00389 _array[i].setNbSymb(_nbInputSymb);
00390 _array[i].reserveMem();
00391 }
00392 }
00393 Seq::Seq(const Seq& obj){
00394 _maxLength=obj._maxLength;
00395 _length=obj._length;
00396 _nbInputSymb=obj._nbInputSymb;
00397 _reserveMem();
00398 for (int i=0;i<_maxLength;i++) _array[i]=obj._array[i];
00399 }
00400 Seq::Seq(short int maxLength, short int nbInputSymb){
00401 _maxLength=maxLength;
00402 _nbInputSymb=nbInputSymb;
00403 _length=0;
00404 _reserveMem();
00405 }
00406 Seq::~Seq(){
00407 delete [] _array;
00408 }
00409 Seq & Seq::operator=(const Seq& obj){
00410 if (this !=(&obj)){
00411 delete [] _array;
00412 _maxLength=obj._maxLength;
00413 _nbInputSymb=obj._nbInputSymb;
00414 _length=obj._length;
00415 _reserveMem();
00416 for (int i=0;i<_maxLength;i++) _array[i]=obj._array[i];
00417 }
00418 return *(this);
00419 }
00420 SymbTab& Seq::operator[](short int idx){
00421 if (idx>=_maxLength) throw Exception("index > sequence length"
00422 , __FILE__, __LINE__);
00423 return _array[idx];
00424 }
00425 void Seq::setLength(short int length){
00426 if (length>_maxLength) throw Exception("index > sequence length", __FILE__, __LINE__);
00427 _length=length;
00428 }
00429 short int Seq::getLength(){
00430 return _length;
00431 }
00432 void Seq::init(short int length=0){
00433 if ((length>_maxLength)|| (length<0)) throw Exception("index > max sequence length" , __FILE__, __LINE__);
00434 for (int i=length;i<_maxLength;i++)
00435 _array[i].init(false);
00436 _length=length;
00437 }
00438 void Seq::show(){
00439 cout <<"sequence"<<endl;
00440 for (int i=0;i<_length;i++){
00441 cout <<"["<<i<<"]";_array[i].show();}
00442 }
00443
00444 void Seq:: add(SymbTab *ptr){
00445 if (_length==_maxLength) throw Exception("out of bound" , __FILE__, __LINE__);
00446 _array[_length]=(*ptr);
00447 _length++;
00448 }
00449
00450
00451
00452 SequenceDecoder::SequenceDecoder(short int nbInputSymb){
00453 _nbOutputSeq=0;
00454 _nbOutputSeqPart=0;
00455 _nbInputSymb=nbInputSymb;
00456 _seed=NULL;
00457 }
00458 SequenceDecoder:: SequenceDecoder(const SequenceDecoder &obj){
00459 throw Exception("recopy constructor not available"
00460 , __FILE__, __LINE__);
00461 }
00462 SequenceDecoder SequenceDecoder:: operator=(const SequenceDecoder & obj){
00463 throw Exception("= constructor not available"
00464 , __FILE__, __LINE__);
00465 }
00466 SequenceDecoder::~SequenceDecoder(){
00467 _freeTree(_seed);
00468 }
00469 void SequenceDecoder::_freeTree(SequenceDecoderNode *seed){
00470 if (seed==NULL) return;
00471 _freeTree(seed->br);
00472 _freeTree(seed->ch);
00473 delete seed->symbTab;
00474 delete seed;
00475 }
00476 SequenceDecoderNode* SequenceDecoder::_newNode(const SymbTab &symbTab,const short int outputSymb,
00477 SequenceDecoderNode *ch,SequenceDecoderNode *br){
00478 SequenceDecoderNode *ptr=new SequenceDecoderNode;
00479 ptr->symbTab=new SymbTab(symbTab);
00480 ptr->outputSymb=outputSymb;
00481 ptr->ch=ch;
00482 ptr->br=br;
00483 return ptr;
00484 }
00485 SequenceDecoderNode* SequenceDecoder::_findInsert(const SymbTab &symbTab,SequenceDecoderNode *ptr){
00486 if (ptr==NULL) return _newNode(symbTab,-1,NULL,NULL);
00487 while ((ptr->br!=NULL) && (*(ptr->symbTab)!=symbTab)) ptr=ptr->br;
00488 if (*(ptr->symbTab)==symbTab) return ptr;
00489 else{
00490 ptr->br=_newNode(symbTab,-1,NULL,NULL);
00491 return ptr->br;
00492 }
00493 }
00494
00495 void SequenceDecoder::addSequence(Seq &sequence, const short int outputSymb){
00496 if (sequence.getLength()==0) throw Exception(" Try to add a null length sequence" , __FILE__, __LINE__);
00497 SequenceDecoderNode *currentP=_findInsert(sequence[0],_seed);
00498 if (_seed==NULL) _seed=currentP;
00499 for (int step=1;step<sequence.getLength();step++){
00500 SequenceDecoderNode* tmpP=_findInsert(sequence[step],currentP->ch);
00501 if (currentP->ch==NULL)currentP->ch=tmpP;
00502 currentP=tmpP;
00503 }
00504 if (currentP->outputSymb != -1) throw Exception("Try to add a sequence on an existing one" , __FILE__, __LINE__);
00505 currentP->outputSymb=outputSymb;
00506 _nbOutputSeqPart++;
00507 }
00508 void SequenceDecoder::show(){
00509 _show(_seed,0);
00510 }
00511 void SequenceDecoder::_show(SequenceDecoderNode *ptr,short int order){
00512 if (ptr==NULL) return;
00513 cout <<"Order["<<order<<"]"<<endl;
00514 (*ptr->symbTab).show();
00515 if (ptr->outputSymb!=-1) cout<<"outputSymb["<<ptr->outputSymb<<"]"<<endl;
00516 _show(ptr->ch,order+1);
00517 _show(ptr->br,order);
00518 }
00519
00520 void SequenceDecoder::_toFile(SequenceDecoderNode *ptr,short int symb,Seq & currentSeq,ostream & outputFile){
00521 if (ptr==NULL)return;
00522 currentSeq.add(ptr->symbTab);
00523 if (ptr->outputSymb==symb){
00524 outputFile << "Symb["<<symb<<"]=";
00525 for (int i=0; i<currentSeq.getLength();i++){
00526 outputFile<<"{";
00527 for (int s=0;s<currentSeq[i].getNbSymb();s++)
00528 if (currentSeq[i].isIn(s)) outputFile <<s<<" ";
00529 outputFile<<"}";
00530 }
00531 outputFile <<endl;
00532 }
00533 _toFile(ptr->ch,symb,currentSeq,outputFile);
00534 currentSeq.init(currentSeq.getLength()-1);
00535 _toFile(ptr->br,symb,currentSeq,outputFile);
00536 }
00537 void SequenceDecoder::toFile(ostream & outputFile){
00538 for(unsigned long int i=0; i<_nbOutputSeq;i++){
00539 Seq currentSeq(100,_nbInputSymb);
00540 currentSeq.init(0);
00541 _toFile(_seed,i,currentSeq,outputFile);
00542 }
00543 }
00544 SequenceDecoderNode * SequenceDecoder::_load(istream &inputFile){
00545 string tmp;
00546 if (inputFile.eof()) return NULL;
00547 inputFile>>tmp;
00548 if (tmp=="nil") return NULL;
00549 SequenceDecoderNode *ptr=NULL;
00550 SequenceDecoderNode* retPtr=NULL;
00551 bool init=true;
00552 if (tmp=="begin"){
00553 while (tmp!="nil"){
00554 SymbTab sym(_nbInputSymb);
00555 SequenceDecoderNode * newPtr=_newNode(sym,-1,NULL,NULL);
00556 newPtr->ch=_load(inputFile);
00557 inputFile>>newPtr->outputSymb;
00558 short int s;
00559 do{
00560 inputFile>>s;
00561 if (s!=-1)newPtr->symbTab->setSymb(s);
00562 } while (s!=-1);
00563 if (init) {retPtr=newPtr;ptr=newPtr;init=false;}
00564 else{ptr->br=newPtr;ptr=ptr->br;}
00565 inputFile>>tmp;
00566 }
00567 ptr->br=NULL;
00568 return retPtr;
00569 }
00570 else throw Exception("nil or eof is missing" , __FILE__, __LINE__);
00571 }
00572 void SequenceDecoder::load(istream &inputFile){
00573 inputFile>>_nbInputSymb;
00574 inputFile>>_nbOutputSeqPart;
00575 inputFile>> _nbOutputSeq;
00576 _seed=_load(inputFile);
00577 }
00578 void SequenceDecoder::_save(SequenceDecoderNode *ptr,ostream & outputFile){
00579 if (ptr==NULL) outputFile<<"nil"<<endl;
00580 else{
00581 while (ptr!=NULL){
00582 outputFile<<"begin"<<endl;
00583 _save(ptr->ch,outputFile);
00584 outputFile<<ptr->outputSymb<<" ";
00585 for (int s=0;s<ptr->symbTab->getNbSymb();s++)
00586 if (ptr->symbTab->isIn(s)) outputFile<<s<<" ";
00587 outputFile<<"-1"<<endl;
00588 ptr=ptr->br;
00589 }
00590 outputFile<<"nil"<<endl;
00591 }
00592 }
00593 void SequenceDecoder::save(ostream & outputFile){
00594 outputFile<<_nbInputSymb<<endl;
00595 outputFile<<_nbOutputSeqPart<<endl;;
00596 outputFile<< _nbOutputSeq<<endl;
00597 _save(_seed,outputFile);
00598 }
00599
00600 bool SequenceDecoder::_decode(SequenceDecoderNode *ptr,
00601 ReadMemory & inp,ostream & outputFile,unsigned long &begin,short int &decodedSeq){
00602 if (debug) cout << "_decode["<<inp.getIdx()<<"]=["<<inp.getCurrentSymb()<<"]"<<endl;
00603 while ((ptr!=NULL)&&(!ptr->symbTab->isIn(inp.getCurrentSymb()))) ptr=ptr->br;
00604 if (ptr==NULL) return false;
00605
00606 if (ptr->ch==NULL){
00607 if (debug) cout <<"end of sequence detected idx["<<inp.getIdx()<<"]"<<endl;
00608 if (ptr->outputSymb==-1) cerr<< "Warning: not ended sequence at idx["<<inp.getIdx()<<"]"<<endl;
00609 else {outputFile <<begin<<" "<<inp.getIdx()<<" "<<ptr->outputSymb<<endl;decodedSeq=ptr->outputSymb;};
00610 inp.lecture();
00611 begin=inp.getIdx();
00612 return true;
00613 }
00614
00615 if (ptr->outputSymb==-1){
00616 if (debug) cout <<"no output at this level idx["<<inp.getIdx()<<"], go to the next level"<<endl;
00617 if (!inp.lecture()) return false;
00618 return _decode(ptr->ch,inp,outputFile,begin,decodedSeq);
00619 }
00620
00621 if (inp.eof()) {
00622 outputFile <<begin<<" "<<inp.getIdx()<<" "<<ptr->outputSymb<<endl;decodedSeq=ptr->outputSymb;
00623
00624
00625 return true;
00626 }
00627
00628 unsigned long idx=inp.getIdx();
00629 if (debug) cout <<"save idx["<<idx<<"]symb="<<inp.getCurrentSymb()<<endl;
00630 inp.lecture();
00631 if (inp.eof() || (!_decode(ptr->ch,inp,outputFile,begin,decodedSeq))){
00632 outputFile <<begin<<" "<<idx<<" "<<ptr->outputSymb<<endl;decodedSeq=ptr->outputSymb;
00633 inp.setIdx(idx);
00634 if (debug) cout << "comeback to idx["<<idx<<"]symb="<<inp.getCurrentSymb()<<endl;
00635 inp.lecture();
00636 begin=inp.getIdx();
00637 }
00638 return true;
00639 }
00640
00641 void SequenceDecoder::decode(String inputFilename,ostream & outputFile,unsigned long segBegin,unsigned long segLength,
00642 bool overlapMode,bool ngramMode,BNgram &ngram) {
00643 ReadMemory inp(inputFilename,100,segBegin,segLength);
00644
00645 if (verbose) cout <<"Begin decoding for seg["<<segBegin<<","<<segLength<<"] overlap:"<<overlapMode<<endl;
00646 if (!inp.lecture(true)) return;
00647 unsigned long begin=inp.getIdx();
00648 bool flag=true;
00649 while (inp.notEof()){
00650 unsigned long saveIdx=inp.getIdx();
00651 short int seq;
00652 bool ret=_decode(_seed,inp,outputFile,begin,seq);
00653 if (!ret){
00654 cout <<"WARNING, Seq unknown beginning by symb["<<inp.getCurrentSymb()<<"]idx["<<inp.getIdx()<<"]"<<endl;
00655 if (!inp.lecture()) return;
00656 begin=inp.getIdx();
00657 }
00658 if (ngramMode) if (flag){ ngram.beginSeq(seq); flag=false;} else ngram.addSymb(seq);
00659 if (overlapMode){
00660 inp.setIdx(saveIdx);
00661 if (verbose) cout <<"overlapMode, comeback to symb idx:"<<saveIdx<<endl;
00662 if (!inp.lecture()) return;
00663 begin=inp.getIdx();
00664 }
00665 if (verbose) cout <<"decode new sequence at idx["<<inp.getIdx()<<"]"<<endl;
00666 }
00667 ngram.endSeq();
00668 }
00669
00670
00671
00672
00673
00674
00675
00676
00677 int sequenceDecoder(Config &config){
00678 String inputTreeFilename=config.getParam("decoderFilename");
00679 String outputFilename=config.getParam("outputFilename");
00680 String inputFilename=config.getParam("inputFilename");
00681 String labelSelectedFrames;
00682 String labelFilename;
00683 bool label=false;
00684 if (config.existsParam("labelFilename")){
00685 labelFilename=config.getParam("labelFilename");
00686 labelSelectedFrames=config.getParam("labelSelectedFrames");
00687 label=true;
00688 }
00689 bool overlapMode=false;
00690 if (config.existsParam("overlapMode")) overlapMode=config.getParam("overlapMode").toBool();
00691 bool ngramMode=false;
00692 String nOutputF;
00693 unsigned long oNgramOrder=0;
00694 if (config.existsParam("ngramOutputFilename")){
00695 ngramMode=true;nOutputF=config.getParam("ngramOutputFilename");
00696 oNgramOrder=config.getParam("ngramOutputOrder").toLong();
00697 }
00698 BNgram outNgram(oNgramOrder,100);
00699 SequenceDecoder decTree(0);
00700 ifstream inputTreeFile(inputTreeFilename.c_str());
00701 decTree.load(inputTreeFile);
00702 ofstream outputFile(outputFilename.c_str(),ios::out| ios::trunc);
00703 if (label){
00704 SegServer segmentsServer;
00705 LabelServer labelServer;
00706 loadClusterFile(labelFilename,segmentsServer,labelServer,config);
00707 long codeSelectedFrame=labelServer.getLabelIndexByString(labelSelectedFrames);
00708 if (codeSelectedFrame==-1) throw Exception("No segment with ["+labelSelectedFrames+"] label" , __FILE__, __LINE__);
00709 SegCluster& selectedSegments=segmentsServer.getCluster(codeSelectedFrame);
00710 Seg* seg;
00711 selectedSegments.rewind();
00712 while((seg=selectedSegments.getSeg())!=NULL){
00713 decTree.decode(inputFilename,outputFile,seg->begin(),seg->length(),overlapMode,ngramMode,outNgram);
00714 }
00715 }
00716 else decTree.decode(inputFilename,outputFile,0,0,overlapMode,ngramMode,outNgram);
00717 if (ngramMode){
00718 ofstream outputFileNgram(nOutputF.c_str(),ios::out| ios::trunc);
00719 outNgram.show(outputFileNgram);
00720 }
00721
00722 return 0;
00723 }
00724
00725
00726
00727
00728
00729
00730
00731
00732 int sequenceExtractor(Config& config){
00733
00734 int maxOrder=config.getParam("maxOrder").toLong();
00735 unsigned long maxNgram=config.getParam("maxNgram").toLong();
00736 int nbSymb=config.getParam("nbInputSymb").toLong();
00737 const int nbSeq=config.getParam("nbOutputSymb").toLong();
00738 bool equalInputInfo=false;
00739 if (config.existsParam("equalInputInfo")) equalInputInfo=config.getParam("equalInputInfo").toBool();
00740 String ngramFilename=config.getParam("ngramFilename");
00741 String ngramExt=config.getParam("ngramExt");
00742 String outputFilenameInfo=config.getParam("outputInfoFilename");
00743 ofstream outputFileInfo(outputFilenameInfo.c_str(),ios::out| ios::trunc);
00744 String outputFilename=config.getParam("outputFilename");
00745 ofstream outputFile(outputFilename.c_str(),ios::out| ios::trunc);
00746
00747 CommonPartTree initialTree;
00748 if (verbose) cout << "reading the ngram files"<<endl;
00749 for (int order=1;order<=maxOrder;order++){
00750 NGram nGram(order,maxNgram);
00751 nGram.load(ngramFilename+String::valueOf(order)+ngramExt,config);
00752 if (debug) nGram.showTable();
00753 initialTree.addNGram(nGram);
00754 }
00755 initialTree.setTotalCount(initialTree.getTotalChildCount());
00756 if (verbose) cout <<"total input symbols:"<<initialTree.getTotalChildCount()<<endl;
00757 if (debug) {cout<<"initial tree"<<endl; initialTree.show();}
00758
00759
00760 SequenceDecoder seqTree(nbSymb);
00761 Seq currentSeq(maxOrder,nbSymb);
00762
00763 unsigned long remainingInputSymb=initialTree.getTotalChildCount();
00764 bool fEnd=false;
00765 for (int seq=0;(seq<nbSeq) && (!fEnd);seq++){
00766 if (verbose) cout << "begin of the main sequence detection loop, seq:"<<seq<<endl;
00767 unsigned long targetSeqCount=remainingInputSymb/(nbSeq-seq);
00768 if (verbose) cout << "totalchildcount"<<initialTree.getTotalChildCount()<<" Remaining input symb:"<<remainingInputSymb<<endl;
00769 if (verbose) cout <<"target Seq count["<<targetSeqCount<<"]"<<endl;
00770 currentSeq.init();
00771 unsigned long seqCount=initialTree.findMaxSeq(currentSeq);
00772 if (equalInputInfo) seqCount*=currentSeq.getLength();
00773 if (currentSeq.getLength()==0) fEnd=true;
00774 else{
00775 if (verbose){
00776 cout <<"First Seq["<<seq<<"] Length["<<currentSeq.getLength();
00777 if (equalInputInfo)cout<<"] Seq input symbols count[";
00778 else cout <<"] SeqCount[";
00779 cout<<seqCount<<"]"<<endl;currentSeq.show();
00780 }
00781 initialTree.suppressSeq(currentSeq);
00782 if (debug){ cout <<"tree after suppress"<<endl; initialTree.show();}
00783 seqTree.addSequence(currentSeq,seq);
00784 if (debug) {cout <<"seq tree after adding initial sequence"<<endl;seqTree.show();}
00785 short int length=currentSeq.getLength()-1;
00786 while ((seqCount<targetSeqCount)&&(length>=0)){
00787 bool end=false;
00788 while ((!end)&&(length>=0)&&(seqCount<targetSeqCount)){
00789 currentSeq.init(length);
00790 if (debug) {cout <<"current path"<<endl;currentSeq.show();}
00791 unsigned long deltaSeqCount=initialTree.findMaxEndSeq(currentSeq);
00792 if (equalInputInfo) deltaSeqCount*=currentSeq.getLength();
00793 end=(deltaSeqCount==0)||(currentSeq.getLength()==0);
00794 if (!end){
00795 seqCount+=deltaSeqCount;
00796 if (verbose){
00797 cout <<"Seq["<<seq<<"] add new seq Length["<<currentSeq.getLength();
00798 if (equalInputInfo)cout<<"] Seq input symbols count[";
00799 else cout <<"] SeqCount[";
00800 cout<<seqCount<<"]"<<endl;currentSeq.show();
00801 }
00802 initialTree.suppressSeq(currentSeq);
00803 if (debug){ cout <<"tree after suppress seq"<<endl; initialTree.show();}
00804 seqTree.addSequence(currentSeq,seq);
00805 if (debug){cout <<"seq tree after adding seq"<<endl;seqTree.show();}
00806 length=currentSeq.getLength()-1;
00807 }
00808 else length--;
00809 }
00810 }
00811 remainingInputSymb-=seqCount;
00812 if (seqCount==0) fEnd=true;
00813 else{
00814 outputFileInfo<<"Sequence["<<seq<<"] Count["<<seqCount<<"]"<<endl;
00815 seqTree.setNbOutputSeq(seq+1);
00816 }
00817 }
00818 }
00819
00820 if (verboseLevel>1){cout <<"FINAL SEQUENCE TREE"<<endl; seqTree.show();}
00821 if (verbose) seqTree.toFile(outputFileInfo);
00822 seqTree.save(outputFile);
00823
00824 return 0;
00825 }
00826
00827 #endif