PocketSphinx 5prealpha
state_align_search.c
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2010 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
42#include "state_align_search.h"
43
44static int
45state_align_search_start(ps_search_t *search)
46{
48
49 /* Activate the initial state. */
50 hmm_enter(sas->hmms, 0, 0, 0);
51
52 return 0;
53}
54
55static void
56renormalize_hmms(state_align_search_t *sas, int frame_idx, int32 norm)
57{
58 int i;
59 for (i = 0; i < sas->n_phones; ++i)
60 hmm_normalize(sas->hmms + i, norm);
61}
62
63static int32
64evaluate_hmms(state_align_search_t *sas, int16 const *senscr, int frame_idx)
65{
66 int32 bs = WORST_SCORE;
67 int i;
68
69 hmm_context_set_senscore(sas->hmmctx, senscr);
70
71 for (i = 0; i < sas->n_phones; ++i) {
72 hmm_t *hmm = sas->hmms + i;
73 int32 score;
74
75 if (hmm_frame(hmm) < frame_idx)
76 continue;
77 score = hmm_vit_eval(hmm);
78 if (score BETTER_THAN bs) {
79 bs = score;
80 }
81 }
82 return bs;
83}
84
85static void
86prune_hmms(state_align_search_t *sas, int frame_idx)
87{
88 int nf = frame_idx + 1;
89 int i;
90
91 /* Check all phones to see if they remain active in the next frame. */
92 for (i = 0; i < sas->n_phones; ++i) {
93 hmm_t *hmm = sas->hmms + i;
94 if (hmm_frame(hmm) < frame_idx)
95 continue;
96 hmm_frame(hmm) = nf;
97 }
98}
99
100static void
101phone_transition(state_align_search_t *sas, int frame_idx)
102{
103 int nf = frame_idx + 1;
104 int i;
105
106 for (i = 0; i < sas->n_phones - 1; ++i) {
107 hmm_t *hmm, *nhmm;
108 int32 newphone_score;
109
110 hmm = sas->hmms + i;
111 if (hmm_frame(hmm) != nf)
112 continue;
113
114 newphone_score = hmm_out_score(hmm);
115 /* Transition into next phone using the usual Viterbi rule. */
116 nhmm = hmm + 1;
117 if (hmm_frame(nhmm) < frame_idx
118 || newphone_score BETTER_THAN hmm_in_score(nhmm)) {
119 hmm_enter(nhmm, newphone_score, hmm_out_history(hmm), nf);
120 }
121 }
122}
123
124#define TOKEN_STEP 20
125static void
126extend_tokenstack(state_align_search_t *sas, int frame_idx)
127{
128 if (frame_idx >= sas->n_fr_alloc) {
129 sas->n_fr_alloc = frame_idx + TOKEN_STEP + 1;
130 sas->tokens = ckd_realloc(sas->tokens,
131 sas->n_emit_state * sas->n_fr_alloc
132 * sizeof(*sas->tokens));
133 }
134 memset(sas->tokens + frame_idx * sas->n_emit_state, 0xff,
135 sas->n_emit_state * sizeof(*sas->tokens));
136}
137
138static void
139record_transitions(state_align_search_t *sas, int frame_idx)
140{
141 state_align_hist_t *tokens;
142 int i;
143
144 /* Push another frame of tokens on the stack. */
145 extend_tokenstack(sas, frame_idx);
146 tokens = sas->tokens + frame_idx * sas->n_emit_state;
147
148 /* Scan all active HMMs */
149 for (i = 0; i < sas->n_phones; ++i) {
150 hmm_t *hmm = sas->hmms + i;
151 int j;
152
153 if (hmm_frame(hmm) < frame_idx)
154 continue;
155 for (j = 0; j < sas->hmmctx->n_emit_state; ++j) {
156 int state_idx = i * sas->hmmctx->n_emit_state + j;
157 /* Record their backpointers on the token stack. */
158 tokens[state_idx].id = hmm_history(hmm, j);
159 tokens[state_idx].score = hmm_score(hmm, j);
160 /* Update backpointer fields with state index. */
161 hmm_history(hmm, j) = state_idx;
162 }
163 }
164}
165
166static int
167state_align_search_step(ps_search_t *search, int frame_idx)
168{
170 acmod_t *acmod = ps_search_acmod(search);
171 int16 const *senscr;
172 int i;
173
174 /* Calculate senone scores. */
175 for (i = 0; i < sas->n_phones; ++i)
176 acmod_activate_hmm(acmod, sas->hmms + i);
177 senscr = acmod_score(acmod, &frame_idx);
178
179 /* Renormalize here if needed. */
180 /* FIXME: Make sure to (unit-)test this!!! */
181 if ((sas->best_score - 0x300000) WORSE_THAN WORST_SCORE) {
182 E_INFO("Renormalizing Scores at frame %d, best score %d\n",
183 frame_idx, sas->best_score);
184 renormalize_hmms(sas, frame_idx, sas->best_score);
185 }
186
187 /* Viterbi step. */
188 sas->best_score = evaluate_hmms(sas, senscr, frame_idx);
189 prune_hmms(sas, frame_idx);
190
191 /* Transition out of non-emitting states. */
192 phone_transition(sas, frame_idx);
193
194 /* Generate new tokens from best path results. */
195 record_transitions(sas, frame_idx);
196
197 /* Update frame counter */
198 sas->frame = frame_idx;
199
200 return 0;
201}
202
203static int
204state_align_search_finish(ps_search_t *search)
205{
207 hmm_t *final_phone = sas->hmms + sas->n_phones - 1;
210
211 int last_frame, cur_frame;
212 state_align_hist_t last, cur;
213
214 /* Best state exiting the last cur_frame. */
215 last.id = cur.id = hmm_out_history(final_phone);
216 last.score = hmm_out_score(final_phone);
217 if (last.id == 0xffff) {
218 E_ERROR("Failed to reach final state in alignment\n");
219 return -1;
220 }
221 itor = ps_alignment_states(sas->al);
222 last_frame = sas->frame + 1;
223 for (cur_frame = sas->frame - 1; cur_frame >= 0; --cur_frame) {
224 cur = sas->tokens[cur_frame * sas->n_emit_state + cur.id];
225 /* State boundary, update alignment entry for next state. */
226 if (cur.id != last.id) {
227 itor = ps_alignment_iter_goto(itor, last.id);
228 assert(itor != NULL);
229 ent = ps_alignment_iter_get(itor);
230 ent->start = cur_frame + 1;
231 ent->duration = last_frame - ent->start;
232 ent->score = last.score - cur.score;
233 E_DEBUG(1,("state %d start %d end %d\n", last.id,
234 ent->start, last_frame));
235 last = cur;
236 last_frame = cur_frame + 1;
237 }
238 }
239 /* Update alignment entry for initial state. */
240 itor = ps_alignment_iter_goto(itor, 0);
241 assert(itor != NULL);
242 ent = ps_alignment_iter_get(itor);
243 ent->start = 0;
244 ent->duration = last_frame;
245 E_DEBUG(1,("state %d start %d end %d\n", 0,
246 ent->start, last_frame));
249
250 return 0;
251}
252
253static int
254state_align_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p)
255{
256 /* This does nothing. */
257 return 0;
258}
259
260static void
261state_align_search_free(ps_search_t *search)
262{
264 ps_search_base_free(search);
265 ckd_free(sas->hmms);
266 ckd_free(sas->tokens);
267 hmm_context_free(sas->hmmctx);
268 ckd_free(sas);
269}
270
271static ps_searchfuncs_t state_align_search_funcs = {
272 /* start: */ state_align_search_start,
273 /* step: */ state_align_search_step,
274 /* finish: */ state_align_search_finish,
275 /* reinit: */ state_align_search_reinit,
276 /* free: */ state_align_search_free,
277 /* lattice: */ NULL,
278 /* hyp: */ NULL,
279 /* prob: */ NULL,
280 /* seg_iter: */ NULL,
281};
282
284state_align_search_init(const char *name,
285 cmd_ln_t *config,
286 acmod_t *acmod,
287 ps_alignment_t *al)
288{
291 hmm_t *hmm;
292
293 sas = ckd_calloc(1, sizeof(*sas));
294 ps_search_init(ps_search_base(sas), &state_align_search_funcs,
295 PS_SEARCH_TYPE_STATE_ALIGN, name,
296 config, acmod, al->d2p->dict, al->d2p);
297 sas->hmmctx = hmm_context_init(bin_mdef_n_emit_state(acmod->mdef),
298 acmod->tmat->tp, NULL, acmod->mdef->sseq);
299 if (sas->hmmctx == NULL) {
300 ckd_free(sas);
301 return NULL;
302 }
303 sas->al = al;
304
305 /* Generate HMM vector from phone level of alignment. */
308 sas->hmms = ckd_calloc(sas->n_phones, sizeof(*sas->hmms));
309 for (hmm = sas->hmms, itor = ps_alignment_phones(al); itor;
310 ++hmm, itor = ps_alignment_iter_next(itor)) {
312 hmm_init(sas->hmmctx, hmm, FALSE,
313 ent->id.pid.ssid, ent->id.pid.tmatid);
314 }
315 return ps_search_base(sas);
316}
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
Definition: acmod.c:1213
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
Definition: acmod.c:1106
#define BETTER_THAN
Is one score better than another?
Definition: hmm.h:95
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
Definition: hmm.h:227
#define WORST_SCORE
Large "bad" score.
Definition: hmm.h:84
#define WORSE_THAN
Is one score worse than another?
Definition: hmm.h:100
ps_alignment_iter_t * ps_alignment_states(ps_alignment_t *al)
Iterate over the alignment starting at the first state.
Definition: ps_alignment.c:397
ps_alignment_iter_t * ps_alignment_iter_next(ps_alignment_iter_t *itor)
Move an alignment iterator forward.
Definition: ps_alignment.c:437
ps_alignment_iter_t * ps_alignment_iter_goto(ps_alignment_iter_t *itor, int pos)
Move alignment iterator to given index.
Definition: ps_alignment.c:424
ps_alignment_entry_t * ps_alignment_iter_get(ps_alignment_iter_t *itor)
Get the alignment entry pointed to by an iterator.
Definition: ps_alignment.c:411
int ps_alignment_iter_free(ps_alignment_iter_t *itor)
Release an iterator before completing all iterations.
Definition: ps_alignment.c:417
int ps_alignment_n_phones(ps_alignment_t *al)
Number of phones.
Definition: ps_alignment.c:357
int ps_alignment_n_states(ps_alignment_t *al)
Number of states.
Definition: ps_alignment.c:363
ps_alignment_iter_t * ps_alignment_phones(ps_alignment_t *al)
Iterate over the alignment starting at the first phone.
Definition: ps_alignment.c:383
int ps_alignment_propagate(ps_alignment_t *al)
Propagate timing information up from state sequence.
Definition: ps_alignment.c:313
State (and phone and word) alignment search.
Acoustic model structure.
Definition: acmod.h:148
bin_mdef_t * mdef
Model definition.
Definition: acmod.h:159
tmat_t * tmat
Transition matrices.
Definition: acmod.h:160
uint16 ** sseq
Unique senone sequences (2D array built at load time)
Definition: bin_mdef.h:134
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition: dict2pid.h:84
dict_t * dict
Dictionary this table refers to.
Definition: dict2pid.h:89
a structure for a dictionary.
Definition: dict.h:76
An individual HMM among the HMM search space.
Definition: ps_alignment.h:56
Base structure for search module.
V-table for search algorithm.
History structure.
Phone loop search structure.
int32 best_score
Best score in current frame.
state_align_hist_t * tokens
Tokens (backpointers) for state alignment.
ps_alignment_t * al
Alignment structure being operated on.
hmm_context_t * hmmctx
HMM context structure.
hmm_t * hmms
Vector of HMMs corresponding to phone level.
int n_emit_state
Number of emitting states (tokens per frame)
int n_fr_alloc
Number of frames of tokens allocated.
int frame
Current frame being processed.
int n_phones
Number of HMMs (phones).
uint8 *** tp
The transition matrices; kept in the same scale as acoustic scores; tp[tmatid][from-state][to-state].
Definition: tmat.h:56