BNGram.cpp

Go to the documentation of this file.
00001 /*
00002 This file is part of LIA_RAL which is a set of software based on ALIZE
00003 toolkit for speaker recognition. ALIZE toolkit is required to use LIA_RAL.
00004 
00005 LIA_RAL project is a development project was initiated by the computer
00006 science laboratory of Avignon / France (Laboratoire Informatique d'Avignon -
00007 LIA) [http://lia.univ-avignon.fr <http://lia.univ-avignon.fr/>]. Then it
00008 was supported by two national projects of the French Research Ministry:
00009         - TECHNOLANGUE program [http://www.technolangue.net]
00010         - MISTRAL program [http://mistral.univ-avignon.fr]
00011 
00012 LIA_RAL is free software: you can redistribute it and/or modify
00013 it under the terms of the GNU Lesser General Public License as
00014 published by the Free Software Foundation, either version 3 of
00015 the License, or any later version.
00016 
00017 LIA_RAL is distributed in the hope that it will be useful,
00018 but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00020 GNU Lesser General Public License for more details.
00021 
00022 You should have received a copy of the GNU Lesser General Public
00023 License along with LIA_RAL.
00024 If not, see [http://www.gnu.org/licenses/].
00025 
00026 The LIA team as well as the LIA_RAL project team wants to highlight the
00027 limits of voice authentication in a forensic context.
00028 The "Person Authentification by Voice: A Need of Caution" paper
00029 proposes a good overview of this point (cf. "Person
00030 Authentification by Voice: A Need of Caution", Bonastre J.F.,
00031 Bimbot F., Boe L.J., Campbell J.P., Douglas D.A., Magrin-
00032 chagnolleau I., Eurospeech 2003, Genova].
00033 The conclusion of the paper of the paper is proposed bellow:
00034 [Currently, it is not possible to completely determine whether the
00035 similarity between two recordings is due to the speaker or to other
00036 factors, especially when: (a) the speaker does not cooperate, (b) there
00037 is no control over recording equipment, (c) recording conditions are not
00038 known, (d) one does not know whether the voice was disguised and, to a
00039 lesser extent, (e) the linguistic content of the message is not
00040 controlled. Caution and judgment must be exercised when applying speaker
00041 recognition techniques, whether human or automatic, to account for these
00042 uncontrolled factors. Under more constrained or calibrated situations,
00043 or as an aid for investigative purposes, judicious application of these
00044 techniques may be suitable, provided they are not considered as infallible.
00045 At the present time, there is no scientific process that enables one to
00046 uniquely characterize a persones voice or to identify with absolute
00047 certainty an individual from his or her voice.]
00048 
00049 Copyright (C) 2004-2010
00050 Laboratoire d'informatique d'Avignon [http://lia.univ-avignon.fr]
00051 LIA_RAL admin [alize@univ-avignon.fr]
00052 Jean-Francois Bonastre [jean-francois.bonastre@univ-avignon.fr]
00053 */
00054 
00055 #if !defined(ALIZE_BNgram_cpp)
00056 #define ALIZE_BNgram_cpp
00057 
00058 #include <iostream>  
00059 #include <fstream>  // pour outFile
00060 #include <cstdio>   // pour printf()
00061 #include <cassert> // pour le debug pratique
00062 #include <cmath>
00063 #include <liatools.h>
00064 #include "BNgram.h"
00065 
00066 
00067 // BTREE based NGRAM FUNCTIONS
00068 
00069 // Private
00070 BNgramNode * BNgram::_newNode(short int symb,BNgramNode*br=NULL){
00071   BNgramNode * ptr=new BNgramNode;
00072   ptr->child=new BNgramNode* [_nbElt];
00073   ptr->count=new unsigned long [_nbElt];
00074   for (short int i=0;i<_nbElt;i++){ptr->child[i]=NULL;ptr->count[i]=0;}
00075   ptr->idx=(symb/_nbElt)*_nbElt;
00076   ptr->brother=br;
00077   return ptr;
00078 }
00079 BNgramNode * BNgram::_findInsert(BNgramNode* ptr,short int idx){
00080   // TAKE CARE, A LIST ALWAYS BEGIN BY A NODE WITH IDX=0
00081   if (idx<(ptr->idx+_nbElt))return ptr;                   // current node is the good one
00082   while ((ptr->brother!=NULL) && (idx>=(ptr->brother->idx+_nbElt))) ptr=ptr->brother;
00083   if (ptr->brother==NULL) {ptr->brother=_newNode(idx); return ptr->brother;}          // new node at the end
00084   if ((idx>=ptr->brother->idx)&& (idx<ptr->brother->idx+_nbElt)) return ptr->brother; // the node exists
00085   ptr->brother=_newNode(idx,ptr->brother); return ptr->brother;                       // new node in the midle;
00086 }
00087 void BNgram::_add(short int symb,bool first,short int nb=1){
00088   // TAKE CARE, A LIST ALWAYS BEGIN BY A NODE WITH IDX=0
00089   // 1: seed memorizes the good seed for the n level. It is set and created if necessary by the level n-1
00090   static BNgramNode* seed; 
00091   static short int level; // level is the current ngram order set by the level n-1 (i.e. at the end of this function)
00092   if (first) {
00093     seed=_seed;           // If first level (unigram), begin at the Btree seed, else searchSeed and level are ready
00094     level=0;
00095     _totalCount++;
00096   }
00097   if (debug) cout <<"add symb["<<symb<<"] Level["<<level<<"]"<<endl;
00098   // 2: find the good node and add the count 
00099   seed=_findInsert(seed,symb); // find the good node, create it if necessary (3 takes care of the begin of the list)
00100   seed->count[symb%_nbElt]+=nb;
00101   
00102   //3:  Now, prepare the work for the next level
00103   if (seed->child[symb%_nbElt]==NULL) seed->child[symb%_nbElt]=_newNode(0);
00104   seed=seed->child[symb%_nbElt];
00105   level++;
00106 }
00107 void BNgram::_freeTree(BNgramNode *seed){
00108   if (seed==NULL) return;
00109   for (short int i=0;i<_nbElt;i++)_freeTree(seed->child[i]);
00110   delete [] seed->child;
00111   delete [] seed->count;
00112   delete seed;
00113 }
00114 void BNgram::_show(BNgramNode * seed,String ngramS="",ostream & str=cout){
00115   if (seed==NULL) return;
00116   for (BNgramNode* ptr=seed;(ptr!=NULL);ptr=ptr->brother)
00117     for (short int i=0;i<_maxLength;i++)
00118       if (ptr->count[i]!=0){
00119         String newNgramS=ngramS+" "+String::valueOf(ptr->idx+i);
00120         str<<newNgramS<<" "<<ptr->count[i]<<endl;
00121         _show(ptr->child[i],newNgramS,str);
00122       } 
00123 }
00124 // Public
00125 BNgram::BNgram(unsigned long maxLength,unsigned long nbElementByNode=100){
00126   _maxLength=maxLength;
00127   _totalCount=0;
00128   //  _nb=0;
00129   _circArray=new unsigned long [_maxLength];
00130   _readIdx=0;
00131   _nbElt=nbElementByNode;
00132   _seed=_newNode(0);
00133 }
00134 BNgram::~BNgram(){
00135   delete [] _circArray;
00136   _freeTree(_seed);
00137 }
00138 void BNgram::beginSeq(short int symb,short int nb){
00139   _readIdx=0;
00140   _circArray[_readIdx++]=symb;
00141 }
00142 void BNgram::addSymb(short int symb,short int nb){
00143   _circArray[_readIdx%_maxLength]=symb;
00144   if (_readIdx>=(unsigned long) (_maxLength-1)){
00145     _add(_circArray[(_readIdx-(_maxLength-1))%_maxLength],true,nb);
00146     for (unsigned long i=(_readIdx-(_maxLength-1))+1;i<=_readIdx;i++)
00147       _add(_circArray[i%_maxLength],false,nb);
00148   }
00149   _readIdx++;
00150 }
00151 void BNgram::endSeq(short int nb){ // Take care, readIdx was wrongly incremented 
00152   for (unsigned long length=_maxLength-1;length>0;length--)
00153     if (_readIdx>=length){
00154       _add(_circArray[(_readIdx-length)%_maxLength],true,nb);
00155       for (unsigned long i=(_readIdx-length)+1;i<_readIdx;i++)
00156         _add(_circArray[i%_maxLength],false,nb);
00157     }
00158 }
00159 
00160 void BNgram::show(ostream & str=cout){
00161   String ngramS="";
00162   str<<"symbol total count["<<_totalCount<<"]"<<endl;
00163   _show(_seed,ngramS,str);
00164 }
00165 
00166 // TEST MAIN FUNCTON
00167 void testBNgram(Config &config){
00168   short int order=config.getParam("maxOrder").toLong();
00169   short int symb=-1;
00170   BNgram ngram(order,2);
00171   cin>>symb;
00172  
00173   bool begin=true;
00174   while (symb!=-1){
00175     if (begin){
00176        ngram.beginSeq(symb);
00177        begin=false;
00178     }
00179     else ngram.addSymb(symb);
00180     cin>>symb;
00181   }
00182   ngram.endSeq();
00183   ngram.show();
00184 }
00185 #endif