diff options
Diffstat (limited to 'lib/picoktab.c')
-rw-r--r-- | lib/picoktab.c | 1118 |
1 files changed, 1118 insertions, 0 deletions
diff --git a/lib/picoktab.c b/lib/picoktab.c new file mode 100644 index 0000000..ca8b470 --- /dev/null +++ b/lib/picoktab.c @@ -0,0 +1,1118 @@ +/* + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file picoktab.c + * + * symbol tables needed at runtime + * + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * All rights reserved. + * + * History: + * - 2009-04-20 -- initial version + * + */ + +#include "picoos.h" +#include "picodbg.h" +#include "picoknow.h" +#include "picobase.h" +#include "picoktab.h" +#include "picodata.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} +#endif + + +/** @todo : the following would be better part of a knowledge base. + * Make sure it is consistent with the phoneme symbol table used in the lingware */ + +/* PLANE_PHONEMES */ + +/* PLANE_POS */ + +/* PLANE_PB_STRENGTHS */ + +/* PLANE_ACCENTS */ + +/* PLANE_INTERN */ +#define PICOKTAB_TMPID_PHONSTART '\x26' /* 38 '&' */ +#define PICOKTAB_TMPID_PHONTERM '\x23' /* 35 '#' */ + + +/* ************************************************************/ +/* fixed ids */ +/* ************************************************************/ + + +static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this, + picoos_Common common) +{ + picoktab_FixedIds ids; + + PICODBG_DEBUG(("start")); + + if (NULL == this || NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + ids = (picoktab_FixedIds) this->subObj; + + ids->phonStartId = PICOKTAB_TMPID_PHONSTART; + ids->phonTermId = PICOKTAB_TMPID_PHONTERM; + return PICO_OK; +} + + +static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this, + picoos_MemoryManager mm) +{ + if (NULL != this) { + picoos_deallocate(mm, (void *) &this->subObj); + } + return PICO_OK; +} + +pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this, + picoos_Common common) +{ + if (NULL == this) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + this->subDeallocate = ktabIdsSubObjDeallocate; + this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t)); + if (NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, + NULL, NULL); + } + return ktabIdsInitialize(this, common); +} + +picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this) +{ + return ((NULL == this) ? NULL : ((picoktab_FixedIds) this->subObj)); +} + + +picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm) +{ + picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this)); + if (NULL != this) { + /* initialize */ + } + return this; +} + + +void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this) +{ + if (NULL != (*this)) { + /* terminate */ + picoos_deallocate(mm,(void *)this); + } +} + + + +/* ************************************************************/ +/* Graphs */ +/* ************************************************************/ + +/* overview binary file format for graphs kb: + + graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs + + NROFSENTRIES : 2 bytes, number of entries in offset table + SIZEOFSENTRY : 1 byte, size of one entry in offset table + + ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET) + + OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs + + graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph) + + graph = PROPSET FROM TO [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] + + FROM : 1..4 unsigned bytes, UTF8 character without terminating 0 + TO : 1..4 unsigned bytes, UTF8 character without terminating 0 + PROPSET : 1 unsigned byte, least significant bit : has TO field + next bit : has TOKENTYPE + next bit : has TOKENSUBTYPE + next bit : has VALUE + next bit : has LOWERCASE + next bit : has GRAPHSUBS1 + next bit : has GRAPHSUBS2 + next bit : has PUNC + + TOKENTYPE : 1 unsigned byte + TOKENSUBTYPE : 1 unsigned byte + VALUE : 1 unsigned byte + LOWERCASE : 1..4 unsigned bytes, UTF8 character without terminating 0 + GRAPHSUBS1 : 1..4 unsigned bytes, UTF8 character without terminating 0 + GRAPHSUBS2 : 1..4 unsigned bytes, UTF8 character without terminating 0 + PUNC : 1 unsigned byte +*/ + +static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop); + +#define KTAB_START_GRAPHS_NR_OFFSET 0 +#define KTAB_START_GRAPHS_SIZE_OFFSET 2 +#define KTAB_START_GRAPHS_OFFSET_TABLE 3 +#define KTAB_START_GRAPHS_GRAPH_TABLE 0 + +/* bitmasks to extract the grapheme properties info from the property set */ +#define KTAB_GRAPH_PROPSET_TO ((picoos_uint8)'\x01') +#define KTAB_GRAPH_PROPSET_TOKENTYPE ((picoos_uint8)'\x02') +#define KTAB_GRAPH_PROPSET_TOKENSUBTYPE ((picoos_uint8)'\x04') +#define KTAB_GRAPH_PROPSET_VALUE ((picoos_uint8)'\x08') +#define KTAB_GRAPH_PROPSET_LOWERCASE ((picoos_uint8)'\x010') +#define KTAB_GRAPH_PROPSET_GRAPHSUBS1 ((picoos_uint8)'\x020') +#define KTAB_GRAPH_PROPSET_GRAPHSUBS2 ((picoos_uint8)'\x040') +#define KTAB_GRAPH_PROPSET_PUNCT ((picoos_uint8)'\x080') + + +typedef struct ktabgraphs_subobj *ktabgraphs_SubObj; + +typedef struct ktabgraphs_subobj { + picoos_uint16 nrOffset; + picoos_uint16 sizeOffset; + + picoos_uint8 * offsetTable; + picoos_uint8 * graphTable; +} ktabgraphs_subobj_t; + + + +static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this, + picoos_Common common) { + ktabgraphs_subobj_t * ktabgraphs; + + PICODBG_DEBUG(("start")); + + if (NULL == this || NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + ktabgraphs = (ktabgraphs_subobj_t *) this->subObj; + ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]); + ktabgraphs->sizeOffset = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]); + ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]); + ktabgraphs->graphTable = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]); + return PICO_OK; +} + +static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this, + picoos_MemoryManager mm) { + if (NULL != this) { + picoos_deallocate(mm, (void *) &this->subObj); + } + return PICO_OK; +} + + +pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this, + picoos_Common common) { + if (NULL == this) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + this->subDeallocate = ktabGraphsSubObjDeallocate; + this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t)); + if (NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, + NULL, NULL); + } + return ktabGraphsInitialize(this, common); +} + + +picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) { + if (NULL == this) { + return NULL; + } else { + return (picoktab_Graphs) this->subObj; + } +} + + +/* Graphs methods */ + +picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this, + const picoos_uint8 *graph, + const picoos_uint8 graphlenmax) { + + picoos_uint8 ui8App; + picoos_uint32 graphsOffset; + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + + ui8App = graphlenmax; /* avoid warning "var not used in this function"*/ + + graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph); + return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV; +} + + +static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str) +{ + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + picoos_uint32 i, l; + + i = 0; + l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]); + while (i<l) { + str[i] = g->graphTable[graphsOffset+propOffset+i]; + i++; + } + str[l] = 0; +} + + +static picoos_uint32 ktab_propOffset(const picoktab_Graphs this, + picoos_uint32 graphsOffset, picoos_uint32 prop) +/* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table; + If the property is found, a value > 0 is returned otherwise 0 */ +{ + picoos_uint32 n = 0; + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; + + if ((g->graphTable[graphsOffset] & prop) == prop) { + n = n + 1; /* overread PROPSET field */ + n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */ + if (prop > KTAB_GRAPH_PROPSET_TO) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO) + == KTAB_GRAPH_PROPSET_TO) { + n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE) + == KTAB_GRAPH_PROPSET_TOKENTYPE) { + n = n + 1; /* overread TOKENTYPE field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) + == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { + n = n + 1; /* overread stokentype field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_VALUE) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE) + == KTAB_GRAPH_PROPSET_VALUE) { + n = n + 1; /* overread value field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE) + == KTAB_GRAPH_PROPSET_LOWERCASE) { + n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1) + == KTAB_GRAPH_PROPSET_GRAPHSUBS1) { + n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2) + == KTAB_GRAPH_PROPSET_GRAPHSUBS2) { + n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */ + } + } else { + return n; + } + if (prop > KTAB_GRAPH_PROPSET_PUNCT) { + if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT) + == KTAB_GRAPH_PROPSET_PUNCT) { + n = n + 1; /* overread value field */ + } + } else { + return n; + } + } + + return n; +} + + +picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph) +{ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + picoos_int32 a, b, m; + picoos_uint32 graphsOffset; + picoos_uint32 propOffset; + picobase_utf8char from; + picobase_utf8char to; + picoos_bool utfGEfrom; + picoos_bool utfLEto; + + if (g->nrOffset > 0) { + a = 0; + b = g->nrOffset-1; + do { + m = (a+b) / 2; + + /* get offset to graph[m] */ + if (g->sizeOffset == 1) { + graphsOffset = g->offsetTable[g->sizeOffset*m]; + } + else { + graphsOffset = g->offsetTable[g->sizeOffset*m ] + + 256*g->offsetTable[g->sizeOffset*m + 1]; + /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset)); + */ + } + + /* get FROM and TO field of graph[m] */ + ktab_getStrProp(this, graphsOffset, 1, from); + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO); + if (propOffset > 0) { + ktab_getStrProp(this, graphsOffset, propOffset, to); + } + else { + picoos_strcpy((picoos_char *)to, (picoos_char *)from); + } + + /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to)); + */ + utfGEfrom = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0; + utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0; + + if (utfGEfrom && utfLEto) { + /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph)); + */ + return graphsOffset; + } + if (!utfGEfrom) { + b = m-1; + } + else if (!utfLEto) { + a = m+1; + } + } while (a<=b); + } + PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph)); + return 0; +} + + + + +picoos_bool picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType) +{ + picoos_uint32 propOffset; + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE); + if (propOffset > 0) { + *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]); + return TRUE; + } + else { + return FALSE; + } +} + + +picoos_bool picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType) +{ + picoos_uint32 propOffset; + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE); + if (propOffset > 0) { + *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]); + return TRUE; + } + else { + return FALSE; + } +} + +picoos_bool picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value) +{ + picoos_uint32 propOffset; + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE); + if (propOffset > 0) { + *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]); + return TRUE; + } + else { + return FALSE; + } +} + + +picoos_bool picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2) +{ + picoos_uint32 propOffset; + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT); + if (propOffset > 0) { + if (g->graphTable[graphsOffset+propOffset] == 2) { + *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND; + } + else { + *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND; + } + if (g->graphTable[graphsOffset+1] == '.') { + *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T; + } + else if (g->graphTable[graphsOffset+1] == '?') { + *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q; + } + else if (g->graphTable[graphsOffset+1] == '!') { + *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E; + } + else { + *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE; + } + return TRUE; + } + else { + return FALSE; + } +} + + +picoos_bool picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase) +{ + picoos_uint32 propOffset; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE); + if (propOffset > 0) { + ktab_getStrProp(this, graphsOffset, propOffset, lowercase); + return TRUE; + } + else { + return FALSE; + } +} + + +picoos_bool picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1) +{ + picoos_uint32 propOffset; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1); + if (propOffset > 0) { + ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1); + return TRUE; + } + else { + return FALSE; + } +} + + +picoos_bool picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2) +{ + picoos_uint32 propOffset; + + propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2); + if (propOffset > 0) { + ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2); + return TRUE; + } + else { + return FALSE; + } +} +/* *****************************************************************/ +/* used for tools */ + +static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to) +{ + picoos_uint32 l; + l = picobase_det_utf8_length(**pos); + while (l>0) { + *(to++) = *((*pos)++); + l--; + } + *to = 0; +} + +picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this) +{ + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; + return g->nrOffset; +} + +void picoktab_graphsGetGraphInfo(const picoktab_Graphs this, + picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to, + picoos_uint8 * propset, + picoos_uint8 * stokenType, picoos_uint8 * stokenSubType, + picoos_uint8 * value, picoos_uchar * lowercase, + picoos_uchar * graphsubs1, picoos_uchar * graphsubs2, + picoos_uint8 * punct) { + ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; + picoos_uint32 graphsOffset; + picoos_uint8 * pos; + + /* calculate offset of graph[graphIndex] */ + if (g->sizeOffset == 1) { + graphsOffset = g->offsetTable[graphIndex]; + } else { + graphsOffset = g->offsetTable[2 * graphIndex] + + (g->offsetTable[2 * graphIndex + 1] << 8); + } + pos = &(g->graphTable[graphsOffset]); + *propset = *pos; + + pos++; /* advance to FROM */ + ktab_getUtf8(&pos, from); /* get FROM and advance */ + if ((*propset) & KTAB_GRAPH_PROPSET_TO) { + ktab_getUtf8(&pos, to); /* get TO and advance */ + } else { + picoos_strcpy((picoos_char *)to, (picoos_char *)from); + } + if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) { + (*stokenType) = *(pos++); /* get TOKENTYPE and advance */ + } else { + (*stokenType) = -1; + } + if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { + (*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */ + } else { + (*stokenSubType) = -1; + } + if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) { + (*value) = *(pos++); /* get VALUE and advance */ + } else { + (*value) = -1; + } + if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) { + ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */ + } else { + lowercase[0] = NULLC; + } + if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) { + ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */ + } else { + graphsubs1[0] = NULLC; + } + if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) { + ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */ + } else { + graphsubs2[0] = NULLC; + } + if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) { + (*punct) = *(pos++); /* get PUNCT and advance */ + } else { + (*punct) = -1; + } +} + +/* ************************************************************/ +/* Phones */ +/* ************************************************************/ + +/* overview binary file format for phones kb: + + phones-kb = specids propertytable + + specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1 + RESERVE1 RESERVE1 RESERVE1 + + propertytable = {PHONEPROP2}=256 + + PRIMSTRESSID1: one byte, ID of primary stress + SECSTRESSID1: one byte, ID of secondary stress + SYLLBOUNDID1: one byte, ID of syllable boundary + PAUSEID1: one byte, ID of pause + RESERVE1: reserved for future use + + PHONEPROP2: one byte, max. of 256 phones directly access this table + to check a property for a phone; binary properties + encoded (1 bit per prop) + least significant bit: vowel + next bit: diphth + next bit: glott + next bit: nonsyllvowel + next bit: syllcons + 3 bits spare + */ + +#define KTAB_START_SPECIDS 0 +#define KTAB_IND_PRIMSTRESS 0 +#define KTAB_IND_SECSTRESS 1 +#define KTAB_IND_SYLLBOUND 2 +#define KTAB_IND_PAUSE 3 +#define KTAB_IND_WORDBOUND 4 + +#define KTAB_START_PROPS 8 + + +typedef struct ktabphones_subobj *ktabphones_SubObj; + +typedef struct ktabphones_subobj { + picoos_uint8 *specids; + picoos_uint8 *props; +} ktabphones_subobj_t; + + +/* bitmasks to extract the property info from props */ +#define KTAB_PPROP_VOWEL '\x01' +#define KTAB_PPROP_DIPHTH '\x02' +#define KTAB_PPROP_GLOTT '\x04' +#define KTAB_PPROP_NONSYLLVOWEL '\x08' +#define KTAB_PPROP_SYLLCONS '\x10' + + +static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this, + picoos_Common common) { + ktabphones_subobj_t * ktabphones; + + PICODBG_DEBUG(("start")); + + if (NULL == this || NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + ktabphones = (ktabphones_subobj_t *) this->subObj; + ktabphones->specids = &(this->base[KTAB_START_SPECIDS]); + ktabphones->props = &(this->base[KTAB_START_PROPS]); + return PICO_OK; +} + +static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this, + picoos_MemoryManager mm) { + if (NULL != this) { + picoos_deallocate(mm, (void *) &this->subObj); + } + return PICO_OK; +} + +pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this, + picoos_Common common) { + if (NULL == this) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + this->subDeallocate = ktabPhonesSubObjDeallocate; + this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t)); + if (NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, + NULL, NULL); + } + return ktabPhonesInitialize(this, common); +} + +picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) { + if (NULL == this) { + return NULL; + } else { + return (picoktab_Phones) this->subObj; + } +} + + +/* Phones methods */ + +picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this, + const picoos_uint8 ch) { + return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]); +} +picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this, + const picoos_uint8 ch) { + return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]); +} +picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this, + const picoos_uint8 ch) { + return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]); +} +picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this, + const picoos_uint8 ch) { + return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]); +} +picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this, + const picoos_uint8 ch) { + return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]); +} + +picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this, + const picoos_uint8 ch) { + picoos_uint8 props; + props = ((ktabphones_SubObj)this)->props[ch]; + return (((KTAB_PPROP_VOWEL & props) && + !(KTAB_PPROP_NONSYLLVOWEL & props)) + || (KTAB_PPROP_SYLLCONS & props)); +} + +picoos_bool picoktab_isPrimstress(const picoktab_Phones this, + const picoos_uint8 ch) { + return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]); +} +picoos_bool picoktab_isSecstress(const picoktab_Phones this, + const picoos_uint8 ch) { + return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]); +} +picoos_bool picoktab_isSyllbound(const picoktab_Phones this, + const picoos_uint8 ch) { + return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]); +} +picoos_bool picoktab_isWordbound(const picoktab_Phones this, + const picoos_uint8 ch) { + return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]); +} +picoos_bool picoktab_isPause(const picoktab_Phones this, + const picoos_uint8 ch) { + return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]); +} + +picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) { + return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]; +} +picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) { + return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]; +} +picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) { + return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]; +} +picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) { + return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]; +} +picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) { + return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]; +} + +/* ************************************************************/ +/* Pos */ +/* ************************************************************/ + +/* overview binary file format for pos kb: + + pos-kb = header posids + header = {COUNT2 OFFS2}=8 + posids = {POSID1 {PARTID1}0:8}1: + + where POSID1 is the value of the (combined) part-of-speech symbol, + and {PARTID1} are the symbol values of its components (empty if it + is not a combined symbol). The {PARTID1} list is sorted. + Part-of-speech symbols with equal number of components are grouped + together. + + The header contains information about these groups: + + COUNT2 specifies the number of elements in the group, and OFFS2 + specifies the offset (relative to the beginning of the kb) where + the group data starts, i.e.: + + 25 32 -> 25 not-combined elements, starting at offset 32 + 44 57 -> 44 elements composed of 2 symbols, starting at offset 57 + 23 189 -> 23 elements composed of 3 symbols, starting at offset 189 + ... + + Currently, each symbol may be composed of up to 8 other symbols. + Therefore, the header has 8 entries, too. The header starts with + the unique POS list, and then in increasing order, 2 symbols, 3 + symbols,... + +Zur Anschauung die ge-printf-te Version: + + 25 32 + 44 57 + 23 189 + 12 281 + 4 341 + 1 365 + 0 0 + 0 0 + 33 | + 34 | + 35 | + 60 | + etc. + 36 | 35 60 + 50 | 35 95 + 51 | 35 97 + 58 | 35 120 + 59 | 35 131 + 61 | 60 75 + 63 | 60 95 + 64 | 60 97 + etc. + 42 | 35 60 117 + 44 | 35 60 131 + 45 | 35 73 97 + 48 | 35 84 97 + 54 | 35 97 131 + 56 | 35 113 120 + 57 | 35 117 120 + 62 | 60 84 122 + etc. + */ + +typedef struct ktabpos_subobj *ktabpos_SubObj; + +typedef struct ktabpos_subobj { + picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB]; + picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB]; +} ktabpos_subobj_t; + + +static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this, + picoos_Common common) { + ktabpos_subobj_t *ktabpos; + picoos_uint16 osprev; + picoos_uint16 os, pos; + picoos_uint8 i; + + PICODBG_DEBUG(("start")); + + if (NULL == this || NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + ktabpos = (ktabpos_subobj_t *)this->subObj; + + os = 0; + for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) { + ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 | + this->base[pos]; + if (ktabpos->nrcomb[i] > 0) { + osprev = os; + os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2]; + ktabpos->nrcombstart[i] = &(this->base[os]); + PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos, + ktabpos->nrcomb[i], osprev, os)); + if (osprev >= os) { + /* cannot be, in a valid kb */ + return picoos_emRaiseException(common->em, + PICO_EXC_FILE_CORRUPT, + NULL, NULL); + } + } else { + if (i == 0) { + /* cannot be, in a valid kb */ + return picoos_emRaiseException(common->em, + PICO_EXC_FILE_CORRUPT, + NULL, NULL); + } + ktabpos->nrcombstart[i] = NULL; + } + } + return PICO_OK; +} + +static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this, + picoos_MemoryManager mm) { + if (NULL != this) { + picoos_deallocate(mm, (void *) &this->subObj); + } + return PICO_OK; +} + +pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this, + picoos_Common common) { + if (NULL == this) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, + NULL, NULL); + } + this->subDeallocate = ktabPosSubObjDeallocate; + this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t)); + if (NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, + NULL, NULL); + } + return ktabPosInitialize(this, common); +} + +picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) { + if (NULL == this) { + return NULL; + } else { + return (picoktab_Pos) this->subObj; + } +} + + +/* Pos methods */ + +static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1, + const picoos_uint8 *grp2, + picoos_uint8 len) +{ + /* if both, grp1 and grp2 would be sorted in ascending order + we could implement a function picoktab_comparePosGroup in + a similar manner as strcmp */ + + picoos_uint16 i, j, equal; + + equal = 1; + + i = 0; + while (equal && (i < len)) { + /* search grp1[i] in grp2 */ + j = 0; + while ((j < len) && (grp1[i] != grp2[j])) { + j++; + } + equal = (j < len); + i++; + } + + return equal; +} + + +picoos_bool picoktab_isUniquePos(const picoktab_Pos this, + const picoos_uint8 pos) { + ktabpos_subobj_t *ktabpos; + picoos_uint16 i; + + /* speed-up possible with e.g. binary search */ + + ktabpos = (ktabpos_subobj_t *)this; + PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0])); + i = 0; + while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) { + PICODBG_TRACE(("compare with pos %d at position %d", + ktabpos->nrcombstart[0][i], pos, i)); + i++; + } + return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i])); +} + + +picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this, + const picoos_uint8 pos, + const picoos_uint8 posgroup) +{ + ktabpos_subobj_t *ktabpos; + picoos_uint8 *grp; + picoos_uint16 i, j, n, s, grplen; + picoos_uint8 *e; + picoos_uint8 found; + + ktabpos = (ktabpos_subobj_t *) this; + + grp = NULL; + found = FALSE; + grplen = 0; + + /* currently, a linear search is required to find 'posgroup'; the + knowledge base should be extended to allow for a faster search */ + + /* treat case i==0, grplen==0, ie. pos == posgroup */ + if (pos == posgroup) { + found = TRUE; + } + + i = 1; + while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) { + n = ktabpos->nrcomb[i]; /* number of entries */ + e = ktabpos->nrcombstart[i]; /* ptr to first entry */ + s = i + 2; /* size of an entry in bytes */ + /* was with while starting at 0: + s = i > 0 ? i + 2 : 1; + */ + j = 0; + while ((grp == NULL) && (j < n)) { + if (posgroup == e[0]) { + grp = e + 1; + grplen = s - 1; + } + e += s; + j++; + } + i++; + } + + /* test if 'pos' is contained in the components of 'posgroup' */ + if (grp != NULL) { + for (i = 0; !found && (i < grplen); i++) { + if (pos == grp[i]) { + found = TRUE; + } + } + + /* just a way to test picoktab_getPosGroup */ + /* + PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup); + */ + } + + return found; +} + + +picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this, + const picoos_uint8 *poslist, + const picoos_uint8 poslistlen) +{ + picoos_uint8 poscomb; + ktabpos_subobj_t *ktabpos; + picoos_uint16 i, j, n, s; + picoos_uint8 *e; + + ktabpos = (ktabpos_subobj_t *) this; + poscomb = 0; + + if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) { + i = poslistlen - 1; + if (i > 0) { + n = ktabpos->nrcomb[i]; /* number of entries */ + e = ktabpos->nrcombstart[i]; /* ptr to first entry */ + s = i + 2; /* size of an entry in bytes */ + j = 0; + while (!poscomb && (j < n)) { + if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) { + poscomb = *e; + } + e += s; + j++; + } + if (!poscomb) { + /* combination not found; shouldn't occur if lingware OK! */ + /* contingency solution: take first */ + PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0])); + poscomb = poslist[0]; + } + } else { /* not a composed POS */ + poscomb = poslist[0]; + } + } + + return poscomb; +} + +#ifdef __cplusplus +} +#endif + + +/* end */ |