summaryrefslogtreecommitdiffstats
path: root/lib/picoktab.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/picoktab.c')
-rw-r--r--lib/picoktab.c1118
1 files changed, 1118 insertions, 0 deletions
diff --git a/lib/picoktab.c b/lib/picoktab.c
new file mode 100644
index 0000000..ca8b470
--- /dev/null
+++ b/lib/picoktab.c
@@ -0,0 +1,1118 @@
+/*
+ * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file picoktab.c
+ *
+ * symbol tables needed at runtime
+ *
+ * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
+ * All rights reserved.
+ *
+ * History:
+ * - 2009-04-20 -- initial version
+ *
+ */
+
+#include "picoos.h"
+#include "picodbg.h"
+#include "picoknow.h"
+#include "picobase.h"
+#include "picoktab.h"
+#include "picodata.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+}
+#endif
+
+
+/** @todo : the following would be better part of a knowledge base.
+ * Make sure it is consistent with the phoneme symbol table used in the lingware */
+
+/* PLANE_PHONEMES */
+
+/* PLANE_POS */
+
+/* PLANE_PB_STRENGTHS */
+
+/* PLANE_ACCENTS */
+
+/* PLANE_INTERN */
+#define PICOKTAB_TMPID_PHONSTART '\x26' /* 38 '&' */
+#define PICOKTAB_TMPID_PHONTERM '\x23' /* 35 '#' */
+
+
+/* ************************************************************/
+/* fixed ids */
+/* ************************************************************/
+
+
+static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this,
+ picoos_Common common)
+{
+ picoktab_FixedIds ids;
+
+ PICODBG_DEBUG(("start"));
+
+ if (NULL == this || NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ ids = (picoktab_FixedIds) this->subObj;
+
+ ids->phonStartId = PICOKTAB_TMPID_PHONSTART;
+ ids->phonTermId = PICOKTAB_TMPID_PHONTERM;
+ return PICO_OK;
+}
+
+
+static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this,
+ picoos_MemoryManager mm)
+{
+ if (NULL != this) {
+ picoos_deallocate(mm, (void *) &this->subObj);
+ }
+ return PICO_OK;
+}
+
+pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this,
+ picoos_Common common)
+{
+ if (NULL == this) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ this->subDeallocate = ktabIdsSubObjDeallocate;
+ this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t));
+ if (NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
+ NULL, NULL);
+ }
+ return ktabIdsInitialize(this, common);
+}
+
+picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this)
+{
+ return ((NULL == this) ? NULL : ((picoktab_FixedIds) this->subObj));
+}
+
+
+picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm)
+{
+ picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this));
+ if (NULL != this) {
+ /* initialize */
+ }
+ return this;
+}
+
+
+void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this)
+{
+ if (NULL != (*this)) {
+ /* terminate */
+ picoos_deallocate(mm,(void *)this);
+ }
+}
+
+
+
+/* ************************************************************/
+/* Graphs */
+/* ************************************************************/
+
+/* overview binary file format for graphs kb:
+
+ graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs
+
+ NROFSENTRIES : 2 bytes, number of entries in offset table
+ SIZEOFSENTRY : 1 byte, size of one entry in offset table
+
+ ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET)
+
+ OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs
+
+ graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph)
+
+ graph = PROPSET FROM TO [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2]
+
+ FROM : 1..4 unsigned bytes, UTF8 character without terminating 0
+ TO : 1..4 unsigned bytes, UTF8 character without terminating 0
+ PROPSET : 1 unsigned byte, least significant bit : has TO field
+ next bit : has TOKENTYPE
+ next bit : has TOKENSUBTYPE
+ next bit : has VALUE
+ next bit : has LOWERCASE
+ next bit : has GRAPHSUBS1
+ next bit : has GRAPHSUBS2
+ next bit : has PUNC
+
+ TOKENTYPE : 1 unsigned byte
+ TOKENSUBTYPE : 1 unsigned byte
+ VALUE : 1 unsigned byte
+ LOWERCASE : 1..4 unsigned bytes, UTF8 character without terminating 0
+ GRAPHSUBS1 : 1..4 unsigned bytes, UTF8 character without terminating 0
+ GRAPHSUBS2 : 1..4 unsigned bytes, UTF8 character without terminating 0
+ PUNC : 1 unsigned byte
+*/
+
+static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop);
+
+#define KTAB_START_GRAPHS_NR_OFFSET 0
+#define KTAB_START_GRAPHS_SIZE_OFFSET 2
+#define KTAB_START_GRAPHS_OFFSET_TABLE 3
+#define KTAB_START_GRAPHS_GRAPH_TABLE 0
+
+/* bitmasks to extract the grapheme properties info from the property set */
+#define KTAB_GRAPH_PROPSET_TO ((picoos_uint8)'\x01')
+#define KTAB_GRAPH_PROPSET_TOKENTYPE ((picoos_uint8)'\x02')
+#define KTAB_GRAPH_PROPSET_TOKENSUBTYPE ((picoos_uint8)'\x04')
+#define KTAB_GRAPH_PROPSET_VALUE ((picoos_uint8)'\x08')
+#define KTAB_GRAPH_PROPSET_LOWERCASE ((picoos_uint8)'\x010')
+#define KTAB_GRAPH_PROPSET_GRAPHSUBS1 ((picoos_uint8)'\x020')
+#define KTAB_GRAPH_PROPSET_GRAPHSUBS2 ((picoos_uint8)'\x040')
+#define KTAB_GRAPH_PROPSET_PUNCT ((picoos_uint8)'\x080')
+
+
+typedef struct ktabgraphs_subobj *ktabgraphs_SubObj;
+
+typedef struct ktabgraphs_subobj {
+ picoos_uint16 nrOffset;
+ picoos_uint16 sizeOffset;
+
+ picoos_uint8 * offsetTable;
+ picoos_uint8 * graphTable;
+} ktabgraphs_subobj_t;
+
+
+
+static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this,
+ picoos_Common common) {
+ ktabgraphs_subobj_t * ktabgraphs;
+
+ PICODBG_DEBUG(("start"));
+
+ if (NULL == this || NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ ktabgraphs = (ktabgraphs_subobj_t *) this->subObj;
+ ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]);
+ ktabgraphs->sizeOffset = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]);
+ ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]);
+ ktabgraphs->graphTable = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]);
+ return PICO_OK;
+}
+
+static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this,
+ picoos_MemoryManager mm) {
+ if (NULL != this) {
+ picoos_deallocate(mm, (void *) &this->subObj);
+ }
+ return PICO_OK;
+}
+
+
+pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this,
+ picoos_Common common) {
+ if (NULL == this) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ this->subDeallocate = ktabGraphsSubObjDeallocate;
+ this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t));
+ if (NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
+ NULL, NULL);
+ }
+ return ktabGraphsInitialize(this, common);
+}
+
+
+picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) {
+ if (NULL == this) {
+ return NULL;
+ } else {
+ return (picoktab_Graphs) this->subObj;
+ }
+}
+
+
+/* Graphs methods */
+
+picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this,
+ const picoos_uint8 *graph,
+ const picoos_uint8 graphlenmax) {
+
+ picoos_uint8 ui8App;
+ picoos_uint32 graphsOffset;
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+
+ ui8App = graphlenmax; /* avoid warning "var not used in this function"*/
+
+ graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph);
+ return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV;
+}
+
+
+static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str)
+{
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+ picoos_uint32 i, l;
+
+ i = 0;
+ l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]);
+ while (i<l) {
+ str[i] = g->graphTable[graphsOffset+propOffset+i];
+ i++;
+ }
+ str[l] = 0;
+}
+
+
+static picoos_uint32 ktab_propOffset(const picoktab_Graphs this,
+ picoos_uint32 graphsOffset, picoos_uint32 prop)
+/* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table;
+ If the property is found, a value > 0 is returned otherwise 0 */
+{
+ picoos_uint32 n = 0;
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
+
+ if ((g->graphTable[graphsOffset] & prop) == prop) {
+ n = n + 1; /* overread PROPSET field */
+ n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */
+ if (prop > KTAB_GRAPH_PROPSET_TO) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO)
+ == KTAB_GRAPH_PROPSET_TO) {
+ n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE)
+ == KTAB_GRAPH_PROPSET_TOKENTYPE) {
+ n = n + 1; /* overread TOKENTYPE field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE)
+ == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
+ n = n + 1; /* overread stokentype field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_VALUE) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE)
+ == KTAB_GRAPH_PROPSET_VALUE) {
+ n = n + 1; /* overread value field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE)
+ == KTAB_GRAPH_PROPSET_LOWERCASE) {
+ n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1)
+ == KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
+ n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2)
+ == KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
+ n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */
+ }
+ } else {
+ return n;
+ }
+ if (prop > KTAB_GRAPH_PROPSET_PUNCT) {
+ if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT)
+ == KTAB_GRAPH_PROPSET_PUNCT) {
+ n = n + 1; /* overread value field */
+ }
+ } else {
+ return n;
+ }
+ }
+
+ return n;
+}
+
+
+picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph)
+{ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+ picoos_int32 a, b, m;
+ picoos_uint32 graphsOffset;
+ picoos_uint32 propOffset;
+ picobase_utf8char from;
+ picobase_utf8char to;
+ picoos_bool utfGEfrom;
+ picoos_bool utfLEto;
+
+ if (g->nrOffset > 0) {
+ a = 0;
+ b = g->nrOffset-1;
+ do {
+ m = (a+b) / 2;
+
+ /* get offset to graph[m] */
+ if (g->sizeOffset == 1) {
+ graphsOffset = g->offsetTable[g->sizeOffset*m];
+ }
+ else {
+ graphsOffset = g->offsetTable[g->sizeOffset*m ] +
+ 256*g->offsetTable[g->sizeOffset*m + 1];
+ /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset));
+ */
+ }
+
+ /* get FROM and TO field of graph[m] */
+ ktab_getStrProp(this, graphsOffset, 1, from);
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO);
+ if (propOffset > 0) {
+ ktab_getStrProp(this, graphsOffset, propOffset, to);
+ }
+ else {
+ picoos_strcpy((picoos_char *)to, (picoos_char *)from);
+ }
+
+ /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to));
+ */
+ utfGEfrom = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0;
+ utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0;
+
+ if (utfGEfrom && utfLEto) {
+ /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph));
+ */
+ return graphsOffset;
+ }
+ if (!utfGEfrom) {
+ b = m-1;
+ }
+ else if (!utfLEto) {
+ a = m+1;
+ }
+ } while (a<=b);
+ }
+ PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph));
+ return 0;
+}
+
+
+
+
+picoos_bool picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType)
+{
+ picoos_uint32 propOffset;
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE);
+ if (propOffset > 0) {
+ *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]);
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+
+picoos_bool picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType)
+{
+ picoos_uint32 propOffset;
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE);
+ if (propOffset > 0) {
+ *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]);
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+picoos_bool picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value)
+{
+ picoos_uint32 propOffset;
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE);
+ if (propOffset > 0) {
+ *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]);
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+
+picoos_bool picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2)
+{
+ picoos_uint32 propOffset;
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT);
+ if (propOffset > 0) {
+ if (g->graphTable[graphsOffset+propOffset] == 2) {
+ *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND;
+ }
+ else {
+ *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND;
+ }
+ if (g->graphTable[graphsOffset+1] == '.') {
+ *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T;
+ }
+ else if (g->graphTable[graphsOffset+1] == '?') {
+ *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q;
+ }
+ else if (g->graphTable[graphsOffset+1] == '!') {
+ *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E;
+ }
+ else {
+ *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE;
+ }
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+
+picoos_bool picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase)
+{
+ picoos_uint32 propOffset;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE);
+ if (propOffset > 0) {
+ ktab_getStrProp(this, graphsOffset, propOffset, lowercase);
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+
+picoos_bool picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1)
+{
+ picoos_uint32 propOffset;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1);
+ if (propOffset > 0) {
+ ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1);
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+
+
+picoos_bool picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2)
+{
+ picoos_uint32 propOffset;
+
+ propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2);
+ if (propOffset > 0) {
+ ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2);
+ return TRUE;
+ }
+ else {
+ return FALSE;
+ }
+}
+/* *****************************************************************/
+/* used for tools */
+
+static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to)
+{
+ picoos_uint32 l;
+ l = picobase_det_utf8_length(**pos);
+ while (l>0) {
+ *(to++) = *((*pos)++);
+ l--;
+ }
+ *to = 0;
+}
+
+picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this)
+{
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
+ return g->nrOffset;
+}
+
+void picoktab_graphsGetGraphInfo(const picoktab_Graphs this,
+ picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to,
+ picoos_uint8 * propset,
+ picoos_uint8 * stokenType, picoos_uint8 * stokenSubType,
+ picoos_uint8 * value, picoos_uchar * lowercase,
+ picoos_uchar * graphsubs1, picoos_uchar * graphsubs2,
+ picoos_uint8 * punct) {
+ ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
+ picoos_uint32 graphsOffset;
+ picoos_uint8 * pos;
+
+ /* calculate offset of graph[graphIndex] */
+ if (g->sizeOffset == 1) {
+ graphsOffset = g->offsetTable[graphIndex];
+ } else {
+ graphsOffset = g->offsetTable[2 * graphIndex]
+ + (g->offsetTable[2 * graphIndex + 1] << 8);
+ }
+ pos = &(g->graphTable[graphsOffset]);
+ *propset = *pos;
+
+ pos++; /* advance to FROM */
+ ktab_getUtf8(&pos, from); /* get FROM and advance */
+ if ((*propset) & KTAB_GRAPH_PROPSET_TO) {
+ ktab_getUtf8(&pos, to); /* get TO and advance */
+ } else {
+ picoos_strcpy((picoos_char *)to, (picoos_char *)from);
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) {
+ (*stokenType) = *(pos++); /* get TOKENTYPE and advance */
+ } else {
+ (*stokenType) = -1;
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
+ (*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */
+ } else {
+ (*stokenSubType) = -1;
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) {
+ (*value) = *(pos++); /* get VALUE and advance */
+ } else {
+ (*value) = -1;
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) {
+ ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */
+ } else {
+ lowercase[0] = NULLC;
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
+ ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */
+ } else {
+ graphsubs1[0] = NULLC;
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
+ ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */
+ } else {
+ graphsubs2[0] = NULLC;
+ }
+ if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) {
+ (*punct) = *(pos++); /* get PUNCT and advance */
+ } else {
+ (*punct) = -1;
+ }
+}
+
+/* ************************************************************/
+/* Phones */
+/* ************************************************************/
+
+/* overview binary file format for phones kb:
+
+ phones-kb = specids propertytable
+
+ specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1
+ RESERVE1 RESERVE1 RESERVE1
+
+ propertytable = {PHONEPROP2}=256
+
+ PRIMSTRESSID1: one byte, ID of primary stress
+ SECSTRESSID1: one byte, ID of secondary stress
+ SYLLBOUNDID1: one byte, ID of syllable boundary
+ PAUSEID1: one byte, ID of pause
+ RESERVE1: reserved for future use
+
+ PHONEPROP2: one byte, max. of 256 phones directly access this table
+ to check a property for a phone; binary properties
+ encoded (1 bit per prop)
+ least significant bit: vowel
+ next bit: diphth
+ next bit: glott
+ next bit: nonsyllvowel
+ next bit: syllcons
+ 3 bits spare
+ */
+
+#define KTAB_START_SPECIDS 0
+#define KTAB_IND_PRIMSTRESS 0
+#define KTAB_IND_SECSTRESS 1
+#define KTAB_IND_SYLLBOUND 2
+#define KTAB_IND_PAUSE 3
+#define KTAB_IND_WORDBOUND 4
+
+#define KTAB_START_PROPS 8
+
+
+typedef struct ktabphones_subobj *ktabphones_SubObj;
+
+typedef struct ktabphones_subobj {
+ picoos_uint8 *specids;
+ picoos_uint8 *props;
+} ktabphones_subobj_t;
+
+
+/* bitmasks to extract the property info from props */
+#define KTAB_PPROP_VOWEL '\x01'
+#define KTAB_PPROP_DIPHTH '\x02'
+#define KTAB_PPROP_GLOTT '\x04'
+#define KTAB_PPROP_NONSYLLVOWEL '\x08'
+#define KTAB_PPROP_SYLLCONS '\x10'
+
+
+static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this,
+ picoos_Common common) {
+ ktabphones_subobj_t * ktabphones;
+
+ PICODBG_DEBUG(("start"));
+
+ if (NULL == this || NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ ktabphones = (ktabphones_subobj_t *) this->subObj;
+ ktabphones->specids = &(this->base[KTAB_START_SPECIDS]);
+ ktabphones->props = &(this->base[KTAB_START_PROPS]);
+ return PICO_OK;
+}
+
+static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this,
+ picoos_MemoryManager mm) {
+ if (NULL != this) {
+ picoos_deallocate(mm, (void *) &this->subObj);
+ }
+ return PICO_OK;
+}
+
+pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this,
+ picoos_Common common) {
+ if (NULL == this) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ this->subDeallocate = ktabPhonesSubObjDeallocate;
+ this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t));
+ if (NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
+ NULL, NULL);
+ }
+ return ktabPhonesInitialize(this, common);
+}
+
+picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) {
+ if (NULL == this) {
+ return NULL;
+ } else {
+ return (picoktab_Phones) this->subObj;
+ }
+}
+
+
+/* Phones methods */
+
+picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]);
+}
+picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]);
+}
+picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]);
+}
+picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]);
+}
+picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]);
+}
+
+picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ picoos_uint8 props;
+ props = ((ktabphones_SubObj)this)->props[ch];
+ return (((KTAB_PPROP_VOWEL & props) &&
+ !(KTAB_PPROP_NONSYLLVOWEL & props))
+ || (KTAB_PPROP_SYLLCONS & props));
+}
+
+picoos_bool picoktab_isPrimstress(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]);
+}
+picoos_bool picoktab_isSecstress(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]);
+}
+picoos_bool picoktab_isSyllbound(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]);
+}
+picoos_bool picoktab_isWordbound(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]);
+}
+picoos_bool picoktab_isPause(const picoktab_Phones this,
+ const picoos_uint8 ch) {
+ return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]);
+}
+
+picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) {
+ return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS];
+}
+picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) {
+ return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS];
+}
+picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) {
+ return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND];
+}
+picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) {
+ return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND];
+}
+picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) {
+ return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE];
+}
+
+/* ************************************************************/
+/* Pos */
+/* ************************************************************/
+
+/* overview binary file format for pos kb:
+
+ pos-kb = header posids
+ header = {COUNT2 OFFS2}=8
+ posids = {POSID1 {PARTID1}0:8}1:
+
+ where POSID1 is the value of the (combined) part-of-speech symbol,
+ and {PARTID1} are the symbol values of its components (empty if it
+ is not a combined symbol). The {PARTID1} list is sorted.
+ Part-of-speech symbols with equal number of components are grouped
+ together.
+
+ The header contains information about these groups:
+
+ COUNT2 specifies the number of elements in the group, and OFFS2
+ specifies the offset (relative to the beginning of the kb) where
+ the group data starts, i.e.:
+
+ 25 32 -> 25 not-combined elements, starting at offset 32
+ 44 57 -> 44 elements composed of 2 symbols, starting at offset 57
+ 23 189 -> 23 elements composed of 3 symbols, starting at offset 189
+ ...
+
+ Currently, each symbol may be composed of up to 8 other symbols.
+ Therefore, the header has 8 entries, too. The header starts with
+ the unique POS list, and then in increasing order, 2 symbols, 3
+ symbols,...
+
+Zur Anschauung die ge-printf-te Version:
+
+ 25 32
+ 44 57
+ 23 189
+ 12 281
+ 4 341
+ 1 365
+ 0 0
+ 0 0
+ 33 |
+ 34 |
+ 35 |
+ 60 |
+ etc.
+ 36 | 35 60
+ 50 | 35 95
+ 51 | 35 97
+ 58 | 35 120
+ 59 | 35 131
+ 61 | 60 75
+ 63 | 60 95
+ 64 | 60 97
+ etc.
+ 42 | 35 60 117
+ 44 | 35 60 131
+ 45 | 35 73 97
+ 48 | 35 84 97
+ 54 | 35 97 131
+ 56 | 35 113 120
+ 57 | 35 117 120
+ 62 | 60 84 122
+ etc.
+ */
+
+typedef struct ktabpos_subobj *ktabpos_SubObj;
+
+typedef struct ktabpos_subobj {
+ picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB];
+ picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB];
+} ktabpos_subobj_t;
+
+
+static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this,
+ picoos_Common common) {
+ ktabpos_subobj_t *ktabpos;
+ picoos_uint16 osprev;
+ picoos_uint16 os, pos;
+ picoos_uint8 i;
+
+ PICODBG_DEBUG(("start"));
+
+ if (NULL == this || NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ ktabpos = (ktabpos_subobj_t *)this->subObj;
+
+ os = 0;
+ for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) {
+ ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 |
+ this->base[pos];
+ if (ktabpos->nrcomb[i] > 0) {
+ osprev = os;
+ os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2];
+ ktabpos->nrcombstart[i] = &(this->base[os]);
+ PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos,
+ ktabpos->nrcomb[i], osprev, os));
+ if (osprev >= os) {
+ /* cannot be, in a valid kb */
+ return picoos_emRaiseException(common->em,
+ PICO_EXC_FILE_CORRUPT,
+ NULL, NULL);
+ }
+ } else {
+ if (i == 0) {
+ /* cannot be, in a valid kb */
+ return picoos_emRaiseException(common->em,
+ PICO_EXC_FILE_CORRUPT,
+ NULL, NULL);
+ }
+ ktabpos->nrcombstart[i] = NULL;
+ }
+ }
+ return PICO_OK;
+}
+
+static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this,
+ picoos_MemoryManager mm) {
+ if (NULL != this) {
+ picoos_deallocate(mm, (void *) &this->subObj);
+ }
+ return PICO_OK;
+}
+
+pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this,
+ picoos_Common common) {
+ if (NULL == this) {
+ return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
+ NULL, NULL);
+ }
+ this->subDeallocate = ktabPosSubObjDeallocate;
+ this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t));
+ if (NULL == this->subObj) {
+ return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
+ NULL, NULL);
+ }
+ return ktabPosInitialize(this, common);
+}
+
+picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) {
+ if (NULL == this) {
+ return NULL;
+ } else {
+ return (picoktab_Pos) this->subObj;
+ }
+}
+
+
+/* Pos methods */
+
+static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1,
+ const picoos_uint8 *grp2,
+ picoos_uint8 len)
+{
+ /* if both, grp1 and grp2 would be sorted in ascending order
+ we could implement a function picoktab_comparePosGroup in
+ a similar manner as strcmp */
+
+ picoos_uint16 i, j, equal;
+
+ equal = 1;
+
+ i = 0;
+ while (equal && (i < len)) {
+ /* search grp1[i] in grp2 */
+ j = 0;
+ while ((j < len) && (grp1[i] != grp2[j])) {
+ j++;
+ }
+ equal = (j < len);
+ i++;
+ }
+
+ return equal;
+}
+
+
+picoos_bool picoktab_isUniquePos(const picoktab_Pos this,
+ const picoos_uint8 pos) {
+ ktabpos_subobj_t *ktabpos;
+ picoos_uint16 i;
+
+ /* speed-up possible with e.g. binary search */
+
+ ktabpos = (ktabpos_subobj_t *)this;
+ PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0]));
+ i = 0;
+ while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) {
+ PICODBG_TRACE(("compare with pos %d at position %d",
+ ktabpos->nrcombstart[0][i], pos, i));
+ i++;
+ }
+ return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i]));
+}
+
+
+picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this,
+ const picoos_uint8 pos,
+ const picoos_uint8 posgroup)
+{
+ ktabpos_subobj_t *ktabpos;
+ picoos_uint8 *grp;
+ picoos_uint16 i, j, n, s, grplen;
+ picoos_uint8 *e;
+ picoos_uint8 found;
+
+ ktabpos = (ktabpos_subobj_t *) this;
+
+ grp = NULL;
+ found = FALSE;
+ grplen = 0;
+
+ /* currently, a linear search is required to find 'posgroup'; the
+ knowledge base should be extended to allow for a faster search */
+
+ /* treat case i==0, grplen==0, ie. pos == posgroup */
+ if (pos == posgroup) {
+ found = TRUE;
+ }
+
+ i = 1;
+ while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) {
+ n = ktabpos->nrcomb[i]; /* number of entries */
+ e = ktabpos->nrcombstart[i]; /* ptr to first entry */
+ s = i + 2; /* size of an entry in bytes */
+ /* was with while starting at 0:
+ s = i > 0 ? i + 2 : 1;
+ */
+ j = 0;
+ while ((grp == NULL) && (j < n)) {
+ if (posgroup == e[0]) {
+ grp = e + 1;
+ grplen = s - 1;
+ }
+ e += s;
+ j++;
+ }
+ i++;
+ }
+
+ /* test if 'pos' is contained in the components of 'posgroup' */
+ if (grp != NULL) {
+ for (i = 0; !found && (i < grplen); i++) {
+ if (pos == grp[i]) {
+ found = TRUE;
+ }
+ }
+
+ /* just a way to test picoktab_getPosGroup */
+ /*
+ PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup);
+ */
+ }
+
+ return found;
+}
+
+
+picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this,
+ const picoos_uint8 *poslist,
+ const picoos_uint8 poslistlen)
+{
+ picoos_uint8 poscomb;
+ ktabpos_subobj_t *ktabpos;
+ picoos_uint16 i, j, n, s;
+ picoos_uint8 *e;
+
+ ktabpos = (ktabpos_subobj_t *) this;
+ poscomb = 0;
+
+ if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) {
+ i = poslistlen - 1;
+ if (i > 0) {
+ n = ktabpos->nrcomb[i]; /* number of entries */
+ e = ktabpos->nrcombstart[i]; /* ptr to first entry */
+ s = i + 2; /* size of an entry in bytes */
+ j = 0;
+ while (!poscomb && (j < n)) {
+ if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) {
+ poscomb = *e;
+ }
+ e += s;
+ j++;
+ }
+ if (!poscomb) {
+ /* combination not found; shouldn't occur if lingware OK! */
+ /* contingency solution: take first */
+ PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0]));
+ poscomb = poslist[0];
+ }
+ } else { /* not a composed POS */
+ poscomb = poslist[0];
+ }
+ }
+
+ return poscomb;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+
+/* end */