function [SPitch, VUVSPitch, pAvg, pStd] = spec_trk(Data, Fs, VUVEnergy, Prm)
%SPEC_TRK Spectral pitch tracking for YAAPT pitch tracking
%
%   [SPitch, VUVSPitch, pAvg, pStd] = spec_trk(Data, Fs, VUVEnergy, Prm)
%   computes pitch estimates from nonlinearly processed speech (typically the
%   squared or absolute value of the signal) using frequency-domain processing:
%   it searches for candidate frequencies whose multiples carry spectral energy.
%
%INPUTS:
%   Data:      Nonlinearly processed signal, filtered with F1 (~50 to 1500 Hz)
%   Fs:        The sampling frequency (Hz)
%   VUVEnergy: Voiced/unvoiced decision, based on the NLFER measure
%   Prm:       Parameter structure
%
%OUTPUTS:
%   SPitch:    The spectral pitch track, with unvoiced regions filled by interpolation
%   VUVSPitch: The spectral pitch track, with unvoiced regions set to zero
%   pAvg:      The average of the pitch track over voiced frames
%   pStd:      The standard deviation of the pitch track over voiced frames
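%
% Example (illustrative only; it assumes the caller has already produced the
% nonlinearly processed, bandpass-filtered signal Data, the NLFER-based
% voicing decision VUVEnergy, and a populated parameter structure Prm):
%   [SPitch, VUVSPitch, pAvg, pStd] = spec_trk(Data, Fs, VUVEnergy, Prm);
%   t = (0:length(SPitch)-1) * Prm.frame_space / 1000;  % frame times in seconds
%   plot(t, SPitch); xlabel('Time (s)'); ylabel('F0 (Hz)');
%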
% Creation date: Feb 20, 2006
% Revision dates: Feb 22, 2006, March 11, 2006, April 5, 2006,
% Jun 25, 2006, June 13, 2007
% Programmers: Hongbing Hu, Princy, Zahorian
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This file is a part of the YAAPT program, designed for a fundamental
% frequency tracking algorithm that is extremely robust for both high quality
% and telephone speech.
% The YAAPT program was created by the Speech Communication Laboratory of
% the State University of New York at Binghamton. The program is available
% at http://www.ws.binghamton.edu/zahorian as free software. Further
% information about the program can be found in "A spectral/temporal
% method for robust fundamental frequency tracking," J. Acoust. Soc. Am.
% 123(6), June 2008.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
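% Processing overview (as implemented below): the signal is framed with a
% Kaiser window, an SHC spectrum is computed for each voiced frame and up to
% shc_maxpeaks pitch candidates are picked from its peaks, the best candidates
% are median smoothed, a final track is selected by dynamic programming
% (dynamic5), and the track is interpolated through unvoiced frames and
% lightly smoothed.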
%-- PARAMETERS: set up all of these parameters --------------------------------
nframesize = fix(Prm.frame_length*Fs/1000);
nframejump = fix(Prm.frame_space*Fs/1000);
noverlap = nframesize-nframejump; % overlap width in samples
numframes = fix((length(Data)-noverlap)/nframejump);
% Use a larger (doubled) frame size for the spectral analysis
nframesize = nframesize*2;
% Max number of peak candidates found
maxpeaks = Prm.shc_maxpeaks;
nfftlength = Prm.fft_length; % FFT length
% Resolution of spectrum
delta = Fs/nfftlength;
% Window width in samples
window_length = fix(Prm.shc_window/delta);
% Half window width
half_winlen = fix(Prm.shc_window/(delta*2));
% Max range of SHC
max_SHC = fix((Prm.f0_max+Prm.shc_pwidth*2)/delta);
% Min range of SHC
min_SHC = ceil(Prm.f0_min/delta);
% Number of harmonics considered
numharmonics = Prm.shc_numharms;
%-- INITIALIZATION -----------------------------------------------------------
CandsPitch = zeros(maxpeaks, numframes);
CandsMerit = ones(maxpeaks, numframes);
% Zero-pad the data so the final frame is complete
Data(end:(numframes-1)*nframejump+nframesize) = 0;
%-- MAIN ROUTINE --------------------------------------------------------------
% Compute SHC for each voiced frame
Kaiser_window = kaiser(nframesize);
SHC = zeros(1,max_SHC);
for frame = 1:numframes
    if (VUVEnergy(frame) > 0)
        firstp = 1+(frame-1)*(nframejump);
        Signal = Data(firstp:firstp+nframesize-1) .* Kaiser_window;
        Signal = Signal - mean(Signal);
        Magnit = abs(fft(Signal, nfftlength));
        % Compute SHC (Spectral Harmonic Correlation)
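        % For each candidate frequency f = k*delta (Hz), SHC(k) sums, over a
        % band of width Prm.shc_window Hz, the product of the magnitude
        % spectrum evaluated around f, 2*f, ..., (numharmonics+1)*f, so SHC
        % peaks where spectral energy is stacked at multiples of the
        % candidate frequency.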
        for k = min_SHC:max_SHC
            MultHarms = ones(1, window_length);
            % Multiply in each harmonic band: n=1 -> f0, n=2 -> 2*f0, etc.
            for n = 1:numharmonics+1   % fundamental plus numharmonics harmonics
                nstart = n*k-half_winlen;
                if nstart < 0
                    Harm = zeros(1, window_length);
                    Harm(abs(nstart)+1:window_length) = Magnit(1:nstart+window_length);
                else
                    Harm(1:window_length) = Magnit(nstart+1:nstart+window_length);
                end
                MultHarms = MultHarms .* Harm;
            end
            SHC(k) = sum(MultHarms);
        end
        [CandsPitch(:,frame), CandsMerit(:,frame)] = peaks(SHC, delta, maxpeaks, Prm);
    else
        % If the frame energy is low, treat the frame as unvoiced
        CandsPitch(:,frame) = zeros(1, maxpeaks);
        CandsMerit(:,frame) = ones(1, maxpeaks);
    end
end
% Extract the pitch candidates of voiced frames for the subsequent pitch selection
SPitch = CandsPitch(1,:);
Idx_voiced = SPitch > 0;
VCandsPitch = CandsPitch(:,Idx_voiced);
VCandsMerit = CandsMerit(:,Idx_voiced);
% Average, STD of the first choice candidates
avg_voiced = mean(VCandsPitch(1,:));
std_voiced = std(VCandsPitch(1,:));
% Weight the deltas, so that higher merit candidates are considered
% more favorably
delta1 = abs((VCandsPitch - 0.8*avg_voiced)).*(3-VCandsMerit);
% Select the candidate with the smallest weighted deviation in each frame
[VR, Idx] = min(delta1);
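% Idx(n) is the row (candidate index) of the selected candidate in voiced frame n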
VPeak_minmrt = zeros(1, length(Idx));
VMerit_minmrt = zeros(1, length(Idx));
for n = 1:length(Idx)
    VPeak_minmrt(n) = VCandsPitch(Idx(n), n);
    VMerit_minmrt(n) = VCandsMerit(Idx(n), n);
end
VPeak_minmrt = medfilt1(VPeak_minmrt, max(1,Prm.median_value-2));
% Replace the selected candidates with the median-smoothed values
% computed from the peaks chosen above
for n = 1:length(Idx)
    VCandsPitch(Idx(n), n) = VPeak_minmrt(n);
    % Assign the merit of the smoothed peaks
    VCandsMerit(Idx(n), n) = VMerit_minmrt(n);
end
% Use dynamic programming to find the best overall path among pitch candidates
% Dynamic weight for transition costs,
% balancing local and transition costs
weight_trans = Prm.dp5_k1*std_voiced/avg_voiced;
[VPitch] = dynamic5(VCandsPitch, VCandsMerit,weight_trans);
VPitch = medfilt1(VPitch, max(Prm.median_value-2, 1));
% Computing some statistics from the voiced frames
pAvg = mean(VPitch);
pStd = std(VPitch);
% Stretch the smoothed pitch track back onto the full frame sequence
SPitch(Idx_voiced) = VPitch;
% Interpolate through unvoiced frames; anchor both endpoints at the voiced
% average so the interpolation is defined over the full range
SPitch(1) = pAvg;
SPitch(end) = pAvg;
[Indrows, Indcols, Vals] = find(SPitch);
SPitch = interp1(Indcols, Vals, 1:numframes, 'pchip');
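% Light smoothing of the interpolated track with a 3-point moving-average filter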
FILTER_ORDER = 3;
flt_b = (ones(1,FILTER_ORDER))/FILTER_ORDER ;
flt_a = 1;
SPitch = filter(flt_b, flt_a, SPitch);
% Create pitch track with Voiced/Unvoiced decision
VUVSPitch = SPitch;
VUVSPitch(VUVEnergy==0) = 0;