%
% Generation of a Data Base for Speaker Recognition
%
% The purpose of this m-file is to generate a database that characterizes
% different speakers by their speech spectrum in the band 0 - 4000 Hz.
% For each recording, 6 time windows of 3 s duration (sampling rate
% 44100 Hz) are used to compute amplitude spectra.
% The arithmetic mean of the 6 spectra is formed, and the resulting
% amplitudes are then averaged in groups of 25 consecutive values. For the
% 4000 Hz bandwidth we thus store 480 mean amplitudes in steps of about
% 8.33 Hz as a spectral characterization of a speaker.
%
% The m-file recognize_speaker.m then tests whether a test person can be
% identified from a 6 s speech sample.
%
% R. Brigola, May 2010
% Build the speaker data base: for every recording track<no>.wav compute a
% normalized 480-point spectral fingerprint and store it as one row of
% db_matrix, which is saved to speaker_fingerprint_db.mat.
clear all;
start_time=cputime;

% ---- Parameters ---------------------------------------------------------
N=19;                  % number of speech recordings (track1.wav...trackN.wav)
T=3;                   % time window duration in seconds
sample_max=12002;      % number of FFT bins kept; the bin spacing is
                       % 1/T = 1/3 Hz, so this covers 0-4000 Hz
M=480;                 % number of stored mean amplitudes per speaker
D=floor(sample_max/M); % number of consecutive amplitudes per group (25)
Fs=44100;              % expected sampling frequency
blocksize=T*Fs;        % number of samples per time window
segments=6;            % number of time windows per speaker
sample_no=segments*blocksize; % total number of samples (18 s recording)
db_matrix=zeros(N,M);  % data base matrix: one row of M values per speaker

% wavread is no longer available in current MATLAB/Octave releases;
% use audioread when it exists and fall back to wavread otherwise.
has_audioread=(exist('audioread','file')>0)||(exist('audioread','builtin')>0);

% ---- Loop over all recordings -------------------------------------------
for no=1:N
    audio_input=strcat('track',int2str(no),'.wav'); % input filename
    if has_audioread
        [data,fs_file]=audioread(audio_input,[1 sample_no]);
    else
        [data,fs_file]=wavread(audio_input,sample_no);
    end
    if fs_file~=Fs
        % The bin-to-frequency mapping below assumes Fs samples per second.
        warning('speaker_db:sampleRate', ...
            '%s has sampling rate %d Hz, expected %d Hz; frequency bins will be off.', ...
            audio_input,fs_file,Fs);
    end

    % One time window per column, first channel only. fft works along the
    % columns, so all windows are transformed in one call (no weighting
    % function is applied). Only the current recording is kept in memory.
    frames=reshape(data(1:sample_no,1),blocksize,segments);
    spectra=abs(fft(frames));

    % Arithmetic mean of the amplitude spectra over the time windows,
    % restricted to the first sample_max bins (row vector).
    vec=mean(spectra(1:sample_max,:),2).';

    % Mean over M disjoint groups of exactly D consecutive bins (about
    % 8.33 Hz steps). Bin 1 is the dc-gain and is skipped, so group m
    % covers bins D*(m-1)+2 ... D*m+1.
    % NOTE(review): the previous version summed D+1 bins per group
    % (overlapping by one bin) while dividing by D. If recognize_speaker.m
    % builds its test vector with that old window, align it with this one.
    sumvec=mean(reshape(vec(2:D*M+1),D,M),1);

    % Save the spectral means normalized to unit Euclidean length.
    nrm=norm(sumvec);
    if nrm>0
        db_matrix(no,:)=sumvec./nrm;
    else
        % Silent recording: keep the zero row instead of storing NaN.
        warning('speaker_db:silentTrack', ...
            '%s contains no signal; its data base row is left at zero.', ...
            audio_input);
    end
end
save speaker_fingerprint_db db_matrix;
computation_time=cputime - start_time;