00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055 #if !defined(ALIZE_NGram_cpp)
00056 #define ALIZE_LabelNGram_cpp
00057 #include <cstdio>
00058 #include <cassert>
00059 #include <cmath>
00060 #include <liatools.h>
00061 #include "LabelNGram.h"
00062
00063
00064 using namespace alize;
00065 using namespace std;
00066 NGram::NGram(unsigned long order, unsigned long nb){
00067 _order=order;
00068 _nb=nb;
00069 _sym=new short int [_order*_nb];
00070 _count=new unsigned long [_nb];
00071 _totalCount=0;
00072 }
00073 NGram:: ~NGram(){
00074 _order=0;
00075 _nb=0;
00076 delete [] _sym;
00077 delete [] _count;
00078 }
00079
00080 NGram::NGram(const NGram &ng){
00081 _order=ng._order;
00082 _nb=ng._nb;
00083 _totalCount=ng._totalCount;
00084 _sym=new short int [_order*_nb];
00085 _count=new unsigned long [_nb];
00086 memcpy(_sym, ng._sym, _nb*_order*sizeof(short int));
00087 memcpy(_count, ng._count, _nb*sizeof(unsigned long));
00088 }
00089 const NGram& NGram::operator=(const NGram& ng)
00090 {
00091 if (this==&ng)
00092 return ng;
00093 if ((ng._nb*ng._order)!=(_nb*_order)){
00094 delete _sym;
00095 delete _count;
00096 _sym=new short int [ng._order*ng._nb];
00097 _count=new unsigned long [ng._nb];
00098 }
00099 _nb=ng._nb;
00100 _order=ng._order;
00101 _totalCount=ng._totalCount;
00102 memcpy(_sym, ng._sym, _nb*_order*sizeof(short int));
00103 memcpy(_count, ng._count, _nb*sizeof(unsigned long));
00104 return *this;
00105 }
00106 void NGram::setSize(const unsigned size){
00107 if (size<=_nb) _nb=size;
00108 else throw Exception("Resize is allowad only for reducing the size"
00109 , __FILE__, __LINE__);
00110 short int * sym=new short int [_order*_nb];
00111 unsigned long *count=new unsigned long [_nb];
00112 memcpy(sym,_sym, _nb*_order*sizeof(short int));
00113 memcpy(count,_count, _nb*sizeof(unsigned long));
00114 delete _sym;
00115 delete _count;
00116 _sym=sym;
00117 _count=count;
00118 }
00119
00120 short int NGram::getSymbol(const unsigned idx,const unsigned long o){
00121 if ((idx<0) || (idx>=_nb))
00122 throw Exception("out of array"
00123 , __FILE__, __LINE__);
00124 return _sym[(idx*_order)+o];
00125 }
00126 unsigned long NGram::getCount(const unsigned idx){
00127 if ((idx<0) || (idx>=_nb))
00128 throw Exception("out of array"
00129 , __FILE__, __LINE__);
00130 return _count[idx];
00131 }
00132 void NGram::setCount(const unsigned idx, const unsigned long &count){
00133 if ((idx<0) || (idx>=_nb))
00134 throw Exception("out of array"
00135 , __FILE__, __LINE__);
00136 _count[idx]=count;
00137 }
00138 unsigned long NGram::getTotalCount(){
00139 return _totalCount;
00140 }
00141 void NGram::setTotalCount(const unsigned long &count){
00142 _totalCount=count;
00143 }
00144 void NGram::setSymbol(const unsigned idx,const unsigned long o, const short int sym, unsigned long count=0){
00145 if ((idx<0) || (idx>=_nb))
00146 throw Exception("out of array"
00147 , __FILE__, __LINE__);
00148 _sym[(idx*_order)+o]=sym;
00149 }
00150 void NGram::showTable(ostream &out){
00151 for(unsigned idx=0;idx<_nb;idx++){
00152 out<<"Sym["<<idx<<"]=";
00153 for (unsigned long s=0;s<_order;s++)
00154 out<<"["<<getSymbol(idx,s)<<"]";
00155 out<<"count["<<getCount(idx)<<"]"<<endl;
00156 }
00157 }
00158
00159
00160
00161
00162 void NGram::load(const String filename,Config &config){
00163 XList input(filename,config);
00164 XLine *linep;
00165 input.getLine(0);
00166
00167 unsigned long idx=0;
00168 while (((linep=input.getLine()) != NULL)&&(idx<getSize())){
00169 for (unsigned long i=0;i<getOrder();i++){
00170 short int a=linep->getElement(i).toLong();
00171 setSymbol(idx,i,a);
00172 }
00173 if (linep->getElementCount()==(getOrder()+1)){
00174 unsigned long count=linep->getElement(getOrder()).toLong();
00175 setCount(idx,count);_totalCount+=count;}
00176 else setCount(idx,0);
00177 idx++;
00178 }
00179 if (idx!=getSize()){
00180 cout << "WARNING ! Number of ngram in the file["<<idx<<"] < to the number requested ["<<getSize()<<"]"<<endl;
00181 setSize(idx);
00182 }
00183 if (verboseLevel>1){
00184 cout <<"load symbol table from ["<<filename <<"]"<<endl;
00185 showTable();
00186 }
00187 }
00188 bool isNGram(short int *sym,NGram &tabS,unsigned long & tag){
00189 bool find=false;
00190 unsigned long idx;
00191 for (idx=0;(!find) && (idx<tabS.getSize());idx++){
00192 find=true;
00193 for (unsigned long s=0;(find) && (s<tabS.getOrder());s++)
00194 find=(sym[s]==tabS.getSymbol(idx,s));
00195 }
00196 if (find){
00197 tag=idx;
00198 return true;
00199 }
00200 else return false;
00201 }
00202
00203 short int recognizeSymbol(unsigned long &idxFrame,unsigned long end,ULongVector &tabS){
00204 unsigned long sym=tabS[idxFrame];
00205 while ((idxFrame<end)&&(tabS[idxFrame]==sym))idxFrame++;
00206 return sym;
00207 }
00208
00209
// Shifts the first `order` entries of the three parallel arrays one slot to
// the left: entry i receives the former entry i+1. The last entry (index
// order-1) is left unchanged so that the caller can overwrite it.
// Nothing is done when `order` is 0 or 1 (the loop bound is written so that
// it cannot wrap around for order == 0).
void moveTab(unsigned long *begin,short int *sym,unsigned long *end,unsigned long order){
  for (unsigned long i=1;i<order;i++){
    begin[i-1]=begin[i];
    end[i-1]=end[i];
    sym[i-1]=sym[i];
  }
}
00217
00218 void computeLabelNGram(NGram & NG,SegCluster &cluster,SegCluster &clusterOut,ULongVector &tabS,unsigned long nbSym){
00219 unsigned long begin[100];
00220 short int sym[100];
00221 unsigned long end[100];
00222 SegServer & segServerOut=clusterOut.getServer();
00223 cluster.rewind();
00224 Seg* seg;
00225 while((seg=cluster.getSeg())!=NULL){
00226 unsigned long idxFrame=seg->begin();
00227 unsigned long endS=endSeg(seg);
00228 if (endS>=nbSym) endS=nbSym;
00229 if (idxFrame>endS) idxFrame=endS;
00230 unsigned long beginOOV=idxFrame;
00231 bool oov=true;
00232 if (debug) cout <<"begin Seg["<<idxFrame<<"]"<<endl;
00233
00234 for (unsigned long n=0;(idxFrame<endS) &&(n<NG.getOrder()-1);n++){
00235 begin[n]=idxFrame;
00236 sym[n]=recognizeSymbol(idxFrame,endS,tabS);
00237 end[n]=idxFrame-1;
00238 if (debug) cout <<"sym ["<<sym[n]<<"] begin["<<begin[n]<<"] end["<<end[n]<<"] idxframe["<<idxFrame<<"]"<<endl;
00239 }
00240 while(idxFrame<endS){
00241 begin[NG.getOrder()-1]=idxFrame;
00242 sym[NG.getOrder()-1]=recognizeSymbol(idxFrame,endS,tabS);
00243 end[NG.getOrder()-1]=idxFrame-1;
00244 if (debug) cout <<"sym ["<<sym[NG.getOrder()-1]<<"] begin["<<begin[NG.getOrder()-1]
00245 <<"] end["<<end[NG.getOrder()-1]<<"] idxframe["<<idxFrame<<"]"<<endl;
00246 unsigned long tag;
00247 if (isNGram(sym,NG,tag)){
00248 if ((oov)&&(beginOOV<begin[0])){
00249 if (debug) cout <<"OOV1 begin["<<beginOOV <<"] end["<<begin[0]-1<<"]"<<endl;
00250 Seg &segTmp=segServerOut.createSeg(beginOOV,begin[0]-beginOOV,0,"oov",seg->sourceName());
00251 clusterOut.add(segTmp);
00252 }
00253 if (debug) cout <<"NGRAM ["<<tag<<"] begin["<<begin[0] <<"] end["<<end[NG.getOrder()-1]<<"]"<<endl;
00254 Seg &segTmp=segServerOut.createSeg(begin[0],end[NG.getOrder()-1]-begin[0]+1,0,String::valueOf(tag),seg->sourceName());
00255 clusterOut.add(segTmp);
00256 beginOOV=idxFrame;
00257 oov=false;
00258 }
00259 else oov=true;
00260 moveTab(begin,sym,end,NG.getOrder());
00261 }
00262 if (oov){
00263 Seg &segTmp=segServerOut.createSeg(beginOOV,idxFrame-beginOOV,0,"oov",seg->sourceName());
00264 clusterOut.add(segTmp);
00265 if (debug) cout <<"OOV2 begin["<<beginOOV <<"] end["<<idxFrame-1<<"]"<<endl;
00266 }
00267 }
00268 }
00269
00270
00271
00272 unsigned long loadSymbol(const String &filename,const String &type,ULongVector & ret,Config &config){
00273
00274 if (type=="ascii"){
00275 unsigned long nbSym=0;
00276 XList infile(filename,config);
00277 XLine list=infile.getAllElements();
00278 ret.setSize(list.getElementCount());
00279 nbSym=list.getElementCount();
00280 for (unsigned long i=0;i<nbSym;i++){
00281 String *tmp=list.getElement();
00282 if ((*tmp)!="oov")
00283 ret[i]=tmp->toLong();
00284 else ret[i]=OOV;
00285 }
00286 if (debug) cout << "nb sym:"<<nbSym<<endl;
00287 return nbSym;
00288 }
00289 else throw Exception(type+" file type non recognized for a symbol file"
00290 , __FILE__, __LINE__);
00291 }
00292
00293
00294 int labelNGram(Config& config)
00295 {
00296 if (config.existsParam("debug"))debug=true; else debug=false;
00297 if (config.existsParam("verbose"))verbose=true; else verbose=false;
00298 String extOutputLabel=".sym.lbl";
00299 if (config.existsParam("saveLabelFileExtension")) extOutputLabel=config.getParam("saveLabelFileExtension");
00300 String pathOutput="./";
00301 if (config.existsParam("labelOutputPath")) pathOutput=config.getParam("labelOutputPath");
00302 String extSymbol=".sym";
00303 if (config.existsParam("symbolFileExtension")) extSymbol=config.getParam("symbolFileExtension");
00304 String pathSymbol="./";
00305 if (config.existsParam("symbolPath")) pathSymbol=config.getParam("symbolPath");
00306 String formatSymbol="ascii";
00307 if (config.existsParam("symbolFormat")) pathSymbol=config.getParam("symbolFormat");
00308
00309 String NGramFilename=config.getParam("NGramFilename");
00310 unsigned long NGramOrder=3;
00311 if (config.existsParam("NGramOrder")) NGramOrder=config.getParam("NGramOrder").toLong();
00312 unsigned long NGramSelected=16;
00313 if (config.existsParam("NGramSelected")) NGramSelected=config.getParam("NGramSelected").toLong();
00314 NGram NGramTable(NGramOrder,NGramSelected);
00315 NGramTable.load(NGramFilename,config);
00316
00317 String inputFilename=config.getParam("inputFilename");
00318 String labelSelectedFrames=config.getParam("labelSelectedFrames");
00319 XLine inputFileList;
00320 try{
00321 if (inputFilename.endsWith(".lst")){
00322 XList tmp(inputFilename,config);
00323 inputFileList=tmp.getAllElements();
00324 }
00325 else inputFileList.addElement(inputFilename);
00326 String *p;
00327 while ((p=inputFileList.getElement())){
00328 String& filename=*p;
00329 if (verbose)
00330 cout <<"labelNGram file["<<filename<<"] Table["<<NGramFilename<<"] Order["<<NGramOrder<<"] Selected["<<NGramSelected<<"]"<<endl;
00331 SegServer segServer;
00332 LabelServer labelServer;
00333 loadClusterFile(filename,segServer,labelServer,config);
00334 long codeSelectedFrame=labelServer.getLabelIndexByString(labelSelectedFrames);
00335 if (codeSelectedFrame==-1){
00336 cout << " WARNING - NO DATA with the label["<<labelSelectedFrames<<"] in file ["<<filename<<"]"<<endl;
00337 exit(0);
00338 }
00339 SegCluster& cluster=segServer.getCluster(codeSelectedFrame);
00340 ULongVector tabS;
00341 unsigned long nbSym=loadSymbol(pathSymbol+filename+extSymbol,formatSymbol,tabS,config);
00342 SegServer segServerOutput;
00343 SegCluster& clusterOut=segServerOutput.createCluster(0,labelSelectedFrames,cluster.sourceName());
00344
00345 computeLabelNGram(NGramTable,cluster,clusterOut,tabS,nbSym);
00346
00347
00348 if (verbose){
00349 cout <<"File["<<filename<<"]" <<endl;
00350 cout << "Output the new label file in ["<<pathOutput+filename+extOutputLabel <<"]"<<endl;
00351 }
00352 outputLabelFile(clusterOut,pathOutput+filename+extOutputLabel,config);
00353 }
00354 }
00355
00356
00357 catch (Exception& e)
00358 {
00359 cout << e.toString().c_str() << endl;
00360 }
00361 return 0;
00362 }
00363
00364
00365
00366 #endif