/* * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * @file picoklex.c * * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland * All rights reserved. * * History: * - 2009-04-20 -- initial version * */ #include "picoos.h" #include "picodbg.h" #include "picodata.h" #include "picoknow.h" #include "picoklex.h" #ifdef __cplusplus extern "C" { #endif #if 0 } #endif /* ************************************************************/ /* lexicon */ /* ************************************************************/ /** * @addtogroup picolex * overview: - lex consists of optional searchindex and a non-empty list of lexblocks - lexblocks are fixed size, at the start of a block there is also the start of an entry - using the searchindex a unambiguous lexblock can be determined which contains the entry (or there is no entry) - one lex entry has POS GRAPH PHON, all mandatory, but - PHON can be empty string -> no pronunciation in the resulting TTS output - PHON can be :G2P -> use G2P later to add pronunciation - (POS,GRAPH) is a uniq key (only one entry allowed) - (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and differing POS and differing PHON possible) - for one graph we can have two or three solutions from the lex which all need to be passed on the the next PU - in this case GRAPH, POS, and PHON all must be available in lex sizing: - 3 bytes entry index -> 16MB addressable - 2 bytes searchindex nr -> 64K blocks possible - 5 bytes per searchindex entry - 3 bytes for graph-prefix - 2 bytes blockadr in searchindex -> 64K blocks possible - lexblock size 512B: - 32M possible - with ~20 bytes per entry -> max. average of ~26 entries to be searched per lookup - overhead of ~10 bytes per block to sync with block boundaries - examples: - 500KB lex -> 1000 blocks, 1000 entries in searchindex, ~25.6K lex-entries, - ~5KB searchindex ~10KB overhead for block sync - 100KB lex -> 200 blocks, 200 entries in searchindex, ~5.1K lex-entries, - ~1KB searchindex ~2KB overhead for block sync pil-file: lexicon knowledge base in binary form lex-kb = content content = searchindex {lexblock}1:NRBLOCKS2 lexblock = {lexentry}1: (lexblock size is fixed 512Bytes) searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2 - special cases: - PHON is empty string (no pronunciation in the resulting TTS output): lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 2 POS1 - PHON can be :G2P -> use G2P later to add pronunciation: lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 3 POS1 - multi-byte values always little endian */ /* ************************************************************/ /* lexicon data defines */ /* may not be changed with current implementation */ /* ************************************************************/ /* nr bytes of nrblocks info */ #define PICOKLEX_LEX_NRBLOCKS_SIZE 2 /* search index entry: - nr graphs - nr bytes of block index - nr bytes per entry, NRGRAPHS*INDSIZE */ #define PICOKLEX_LEX_SIE_NRGRAPHS 3 #define PICOKLEX_LEX_SIE_INDSIZE 2 #define PICOKLEX_LEX_SIE_SIZE 5 /* nr of bytes per lexblock */ #define PICOKLEX_LEXBLOCK_SIZE 512 /* reserved values in klex to indicate :G2P needed for a lexentry */ #define PICOKLEX_NEEDS_G2P 5 /* ************************************************************/ /* lexicon type and loading */ /* ************************************************************/ /** object : LexKnowledgeBase * shortcut : klex * derived from : picoknow_KnowledgeBase */ typedef struct klex_subobj *klex_SubObj; typedef struct klex_subobj { picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */ picoos_uint8 *searchind; picoos_uint8 *lexblocks; } klex_subobj_t; static pico_status_t klexInitialize(register picoknow_KnowledgeBase this, picoos_Common common) { picoos_uint32 curpos = 0; klex_subobj_t *klex; PICODBG_DEBUG(("start")); /* check whether (this->size != 0) done before calling this function */ if (NULL == this || NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } klex = (klex_subobj_t *) this->subObj; if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos, &(klex->nrblocks))) { if (klex->nrblocks > 0) { PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos)); klex->searchind = this->base + curpos; } else { klex->searchind = NULL; } klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE + (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE)); return PICO_OK; } else { return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT, NULL, NULL); } } static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this, picoos_MemoryManager mm) { if (NULL != this) { picoos_deallocate(mm, (void *) &this->subObj); } return PICO_OK; } /* we don't offer a specialized constructor for a LexKnowledgeBase but * instead a "specializer" of an allready existing generic * picoknow_KnowledgeBase */ pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this, picoos_Common common) { if (NULL == this) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } if (this->size > 0) { this->subDeallocate = klexSubObjDeallocate; this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t)); if (NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); } return klexInitialize(this, common); } else { /* some dummy klex */ return PICO_OK; } } /* for now we don't need to do anything special for the main lex */ /* pico_status_t picoklex_specializeMainLexKnowledgeBase( picoknow_KnowledgeBase this, picoos_Common common) { return picoklex_specializeLexKnowledgeBase(this,common); } */ /* ************************************************************/ /* lexicon getLex */ /* ************************************************************/ picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this) { if (NULL == this) { return NULL; } else { return (picoklex_Lex) this->subObj; } } /* ************************************************************/ /* functions on searchindex */ /* ************************************************************/ static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this, picoos_uint16 index) { picoos_uint32 pos, val; pos = index * PICOKLEX_LEX_SIE_SIZE; val = this->searchind[pos]; val = (val << 8) + this->searchind[pos + 1]; val = (val << 8) + this->searchind[pos + 2]; return val; } /* Determine first lexblock containing entries for specified grapheme. */ static picoos_uint16 klex_getLexblockNr(const klex_SubObj this, const picoos_uint8 *graphsi) { /* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */ picoos_int32 low, mid, high; picoos_uint32 searchval, indval; /* PICOKLEX_LEX_SIE_NRGRAPHS */ /* convert graph-prefix to number with 'lexicographic' ordering */ searchval = graphsi[0]; searchval = (searchval << 8) + graphsi[1]; searchval = (searchval << 8) + graphsi[2]; low = 0; high = this->nrblocks; /* do binary search */ while (low < high) { mid = (low + high) / 2; indval = klex_getSearchIndexVal(this, mid); if (indval < searchval) { low = mid + 1; } else { high = mid; } } PICODBG_ASSERT(high == low); /* low points to the first entry greater than or equal to searchval */ if (low < this->nrblocks) { indval = klex_getSearchIndexVal(this, low); if (indval > searchval) { low--; /* if there are identical elements in the search index we have to move to the first one */ if (low > 0) { indval = klex_getSearchIndexVal(this, low); while (indval == klex_getSearchIndexVal(this, low-1)) { low--; } } } } else { low = this->nrblocks - 1; } #if defined(PICO_DEBUG) { picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE; PICODBG_DEBUG(("binary search result is %c%c%c (%d)", this->searchind[pos], this->searchind[pos + 1], this->searchind[pos + 2], low)); } #endif return (picoos_uint16) low; } /* Determine number of adjacent lexblocks containing entries for the same grapheme search prefix (identified by search index). */ static picoos_uint16 klex_getLexblockRange(const klex_SubObj this, picoos_uint16 index) { picoos_uint16 count; picoos_uint32 sval1, sval2; sval1 = klex_getSearchIndexVal(this, index); #if defined(PICO_DEBUG) /* 'index' must point to first lexblock of its kind */ if (index > 0) { sval2 = klex_getSearchIndexVal(this, index - 1); PICODBG_ASSERT(sval1 != sval2); } #endif index++; sval2 = klex_getSearchIndexVal(this, index); count = 1; while (sval1 == sval2) { count++; index++; sval2 = klex_getSearchIndexVal(this, index); } return count; } /* ************************************************************/ /* functions on single lexblock */ /* ************************************************************/ static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry, const picoos_uint8 *graph, const picoos_uint16 graphlen) { picoos_uint8 i; picoos_uint8 lexlen; picoos_uint8 *lexgraph; lexlen = lexentry[0] - 1; lexgraph = &(lexentry[1]); for (i=0; (i graph[i]) { return 1; } } if (graphlen == lexlen) { return 0; } else if (lexlen < graphlen) { return -1; } else { return 1; } } static void klex_setLexResult(const picoos_uint8 *lexentry, const picoos_uint32 lexpos, picoklex_lexl_result_t *lexres) { picoos_uint8 i; /* check if :G2P */ if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) { /* set pos */ lexres->posind[0] = lexentry[lexentry[0] + 1]; /* set rest */ lexres->phonfound = FALSE; lexres->posindlen = 1; lexres->nrres = 1; PICODBG_DEBUG(("result %d :G2P", lexres->nrres)); } else { i = lexres->nrres * (PICOKLEX_POSIND_SIZE); lexres->posindlen += PICOKLEX_POSIND_SIZE; lexres->phonfound = TRUE; /* set pos */ lexres->posind[i++] = lexentry[lexentry[0] + 1]; /* set ind, PICOKLEX_IND_SIZE */ lexres->posind[i++] = 0x000000ff & (lexpos); lexres->posind[i++] = 0x000000ff & (lexpos >> 8); lexres->posind[i] = 0x000000ff & (lexpos >> 16); lexres->nrres++; PICODBG_DEBUG(("result %d", lexres->nrres)); } } static void klex_lexblockLookup(klex_SubObj this, const picoos_uint32 lexposStart, const picoos_uint32 lexposEnd, const picoos_uint8 *graph, const picoos_uint16 graphlen, picoklex_lexl_result_t *lexres) { picoos_uint32 lexpos; picoos_int8 rv; lexres->nrres = 0; lexpos = lexposStart; rv = -1; while ((rv < 0) && (lexpos < lexposEnd)) { rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen); if (rv == 0) { /* found */ klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres); if (lexres->phonfound) { /* look for more results, up to MAX_NRRES, don't even check if more results would be available */ while ((lexres->nrres < PICOKLEX_MAX_NRRES) && (lexpos < lexposEnd)) { lexpos += this->lexblocks[lexpos]; lexpos += this->lexblocks[lexpos]; /* if there are no more entries in this block, advance to next block by skipping all zeros */ while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) { lexpos++; } if (lexpos < lexposEnd) { if (klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen) == 0) { klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres); } else { /* no more results, quit loop */ lexpos = lexposEnd; } } } } else { /* :G2P mark */ } } else if (rv < 0) { /* not found, goto next entry */ lexpos += this->lexblocks[lexpos]; lexpos += this->lexblocks[lexpos]; /* if there are no more entries in this block, advance to next block by skipping all zeros */ while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) { lexpos++; } } else { /* rv > 0, not found, won't show up later in block */ } } } /* ************************************************************/ /* lexicon lookup functions */ /* ************************************************************/ picoos_uint8 picoklex_lexLookup(const picoklex_Lex this, const picoos_uint8 *graph, const picoos_uint16 graphlen, picoklex_lexl_result_t *lexres) { picoos_uint16 lbnr, lbc; picoos_uint32 lexposStart, lexposEnd; picoos_uint8 i; picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS]; klex_SubObj klex = (klex_SubObj) this; if (NULL == klex) { PICODBG_ERROR(("no lexicon loaded")); /* no exception here needed, already checked at initialization */ return FALSE; } lexres->nrres = 0; lexres->posindlen = 0; lexres->phonfound = FALSE; for (i = 0; inrblocks) == 0) { /* no searchindex, no lexblock */ PICODBG_WARN(("no searchindex, no lexblock")); return FALSE; } else { lbnr = klex_getLexblockNr(klex, tgraph); PICODBG_ASSERT(lbnr < klex->nrblocks); lbc = klex_getLexblockRange(klex, lbnr); PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks)); } PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc)); lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE; lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE; PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd)); klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres); PICODBG_DEBUG(("lookup done, %d found", lexres->nrres)); return (lexres->nrres > 0); } picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this, const picoos_uint8 *ind, const picoos_uint8 indlen, picoos_uint8 *pos, picoos_uint8 **phon, picoos_uint8 *phonlen) { picoos_uint32 pentry; klex_SubObj klex = (klex_SubObj) this; /* check indlen */ if (indlen != PICOKLEX_IND_SIZE) { return FALSE; } /* PICOKLEX_IND_SIZE */ pentry = 0x000000ff & (ind[0]); pentry |= ((picoos_uint32)(ind[1]) << 8); pentry |= ((picoos_uint32)(ind[2]) << 16); /* check ind if it is within lexblocks byte stream, if not, return FALSE */ if (pentry >= ((picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE)) { return FALSE; } pentry += (klex->lexblocks[pentry]); *phonlen = (klex->lexblocks[pentry++]) - 2; *pos = klex->lexblocks[pentry++]; *phon = &(klex->lexblocks[pentry]); PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen)); return TRUE; } #ifdef __cplusplus } #endif /* end */