diff options
Diffstat (limited to 'lib/picokfst.c')
-rw-r--r-- | lib/picokfst.c | 438 |
1 files changed, 438 insertions, 0 deletions
diff --git a/lib/picokfst.c b/lib/picokfst.c new file mode 100644 index 0000000..560709c --- /dev/null +++ b/lib/picokfst.c @@ -0,0 +1,438 @@ +/* + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file picokfst.c + * + * FST knowledge loading and access + * + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * All rights reserved. + * + * History: + * - 2009-04-20 -- initial version + * + */ +#include "picoos.h" +#include "picodbg.h" +#include "picoknow.h" +#include "picokfst.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} +#endif + + +#define FileHdrSize 4 /* size of FST file header */ + + + +/* ************************************************************/ +/* function to create specialized kb, */ +/* to be used by picorsrc only */ +/* ************************************************************/ + +/** object : FSTKnowledgeBase + * shortcut : kfst + * derived from : picoknow_KnowledgeBase + */ + +typedef struct kfst_subobj * kfst_SubObj; + +typedef struct kfst_subobj{ + picoos_uint8 * fstStream; /* the byte stream base address */ + picoos_int32 hdrLen; /* length of file header */ + picoos_int32 transductionMode; /* transduction mode to be used for FST */ + picoos_int32 nrClasses; /* nr of pair/transition classes in FST; class is in [1..nrClasses] */ + picoos_int32 nrStates; /* nr of states in FST; state is in [1..nrState] */ + picoos_int32 termClass; /* pair class of terminator symbol pair; probably obsolete */ + picoos_int32 alphaHashTabSize; /* size of pair alphabet hash table */ + picoos_int32 alphaHashTabPos; /* absolute address of the start of the pair alphabet */ + picoos_int32 transTabEntrySize; /* size in bytes of each transition table entry */ + picoos_int32 transTabPos; /* absolute address of the start of the transition table */ + picoos_int32 inEpsStateTabPos; /* absolute address of the start of the input epsilon transition table */ + picoos_int32 accStateTabPos; /* absolute address of the table of accepting states */ +} kfst_subobj_t; + + + +/* ************************************************************/ +/* primitives for reading from byte stream */ +/* ************************************************************/ + +/* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into unsigned number 'num'. + '*pos' is modified to the position right after the number */ +static void FixedBytesToUnsignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_uint32 * num) +{ + picoos_int32 i; + + (*num) = 0; + for (i = 0; i < nrBytes; i++) { + (*num) = ((*num) << 8) + (picoos_uint32)stream[*pos]; + (*pos)++; + } +} + + +/* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into signed number 'num'. + '*pos' is modified to the position right after the number */ +static void FixedBytesToSignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_int32 * num) +{ + picoos_int32 i; + picoos_uint32 val; + + val = 0; + for (i = 0; i < nrBytes; i++) { + val = (val << 8) + (picoos_uint32)stream[*pos]; + (*pos)++; + } + if (val % 2 == 1) { + /* negative number */ + (*num) = -((picoos_int32)((val - 1) / 2)) - 1; + } else { + /* positive number */ + (*num) = val / 2; + } +} + + +/* Converts varying-sized sequence of bytes starting at position '*pos' in byte stream 'stream' + into (signed) number 'num'. '*pos' is modified to the position right after the number. */ +static void BytesToNum (picoos_uint8 * stream, picoos_uint32 * pos, picoos_int32 * num) +{ + picoos_uint32 val; + picoos_uint32 b; + + val = 0; + b = (picoos_uint32)stream[*pos]; + (*pos)++; + while (b < 128) { + val = (val << 7) + b; + b = (picoos_uint32)stream[*pos]; + (*pos)++; + } + val = (val << 7) + (b - 128); + if (val % 2 == 1) { + /* negative number */ + (*num) = -((picoos_int32)((val - 1) / 2)) - 1; + } else { + /* positive number */ + (*num) = val / 2; + } +} + + +/* ************************************************************/ +/* setting up FST from byte stream */ +/* ************************************************************/ + +static pico_status_t kfstInitialize(register picoknow_KnowledgeBase this, + picoos_Common common) +{ + picoos_uint32 curpos; + picoos_int32 offs; + kfst_subobj_t * kfst; + + PICODBG_DEBUG(("kfstInitialize -- start\n")); + + if (NULL == this || NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, + NULL); + } + kfst = (kfst_subobj_t *) this->subObj; + + /* +CT+ */ + kfst->fstStream = this->base; + PICODBG_TRACE(("base: %d\n",this->base)); + kfst->hdrLen = FileHdrSize; + curpos = kfst->hdrLen; + BytesToNum(kfst->fstStream,& curpos,& kfst->transductionMode); + BytesToNum(kfst->fstStream,& curpos,& kfst->nrClasses); + BytesToNum(kfst->fstStream,& curpos,& kfst->nrStates); + BytesToNum(kfst->fstStream,& curpos,& kfst->termClass); + BytesToNum(kfst->fstStream,& curpos,& kfst->alphaHashTabSize); + BytesToNum(kfst->fstStream,& curpos,& offs); + kfst->alphaHashTabPos = kfst->hdrLen + offs; + BytesToNum(kfst->fstStream,& curpos,& kfst->transTabEntrySize); + BytesToNum(kfst->fstStream,& curpos,& offs); + kfst->transTabPos = kfst->hdrLen + offs; + BytesToNum(kfst->fstStream,& curpos,& offs); + kfst->inEpsStateTabPos = kfst->hdrLen + offs; + BytesToNum(kfst->fstStream,& curpos,& offs); + kfst->accStateTabPos = kfst->hdrLen + offs; + /* -CT- */ + + return PICO_OK; +} + + +static pico_status_t kfstSubObjDeallocate(register picoknow_KnowledgeBase this, + picoos_MemoryManager mm) +{ + if (NULL != this) { + picoos_deallocate(mm, (void *) &this->subObj); + } + return PICO_OK; +} + + +/* calculates a small number of data (e.g. addresses) from kb for fast access. + * This data is encapsulated in a picokfst_FST that can later be retrieved + * with picokfst_getFST. */ +pico_status_t picokfst_specializeFSTKnowledgeBase(picoknow_KnowledgeBase this, + picoos_Common common) +{ + pico_status_t status; + + if (NULL == this) { + return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); + } + if (0 < this->size) { + /* not a dummy kb */ + this->subDeallocate = kfstSubObjDeallocate; + + this->subObj = picoos_allocate(common->mm, sizeof(kfst_subobj_t)); + + if (NULL == this->subObj) { + return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); + } + status = kfstInitialize(this, common); + if (PICO_OK != status) { + picoos_deallocate(common->mm,(void **)&this->subObj); + } + } + return PICO_OK; +} + + +/* ************************************************************/ +/* FST type and getFST function */ +/* ************************************************************/ + + + +/* return kb FST for usage in PU */ +picokfst_FST picokfst_getFST(picoknow_KnowledgeBase this) +{ + if (NULL == this) { + return NULL; + } else { + return (picokfst_FST) this->subObj; + } +} + + + +/* ************************************************************/ +/* FST access methods */ +/* ************************************************************/ + + +/* see description in header file */ +extern picoos_uint8 picokfst_kfstGetTransductionMode(picokfst_FST this) +{ + kfst_SubObj fst = (kfst_SubObj) this; + if (fst != NULL) { + return fst->transductionMode; + } else { + return 0; + } +} + + +/* see description in header file */ +extern void picokfst_kfstGetFSTSizes (picokfst_FST this, picoos_int32 *nrStates, picoos_int32 *nrClasses) +{ + kfst_SubObj fst = (kfst_SubObj) this; + if (fst != NULL) { + *nrStates = fst->nrStates; + *nrClasses = fst->nrClasses; + } else { + *nrStates = 0; + *nrClasses = 0; + } +} + +/* see description in header file */ +extern void picokfst_kfstStartPairSearch (picokfst_FST this, picokfst_symid_t inSym, + picoos_bool * inSymFound, picoos_int32 * searchState) +{ + picoos_uint32 pos; + picoos_int32 offs; + picoos_int32 h; + picoos_int32 inSymCellPos; + picoos_int32 inSymX; + picoos_int32 nextSameHashInSymOffs; + + kfst_SubObj fst = (kfst_SubObj) this; + (*searchState) = -1; + (*inSymFound) = 0; + h = inSym % fst->alphaHashTabSize; + pos = fst->alphaHashTabPos + (h * 4); + FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs); + if (offs > 0) { + inSymCellPos = fst->alphaHashTabPos + offs; + pos = inSymCellPos; + BytesToNum(fst->fstStream,& pos,& inSymX); + BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs); + while ((inSymX != inSym) && (nextSameHashInSymOffs > 0)) { + inSymCellPos = inSymCellPos + nextSameHashInSymOffs; + pos = inSymCellPos; + BytesToNum(fst->fstStream,& pos,& inSymX); + BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs); + } + if (inSymX == inSym) { + /* input symbol found; state is set to position after symbol cell */ + (*searchState) = pos; + (*inSymFound) = 1; + } + } +} + + +/* see description in header file */ +extern void picokfst_kfstGetNextPair (picokfst_FST this, picoos_int32 * searchState, + picoos_bool * pairFound, + picokfst_symid_t * outSym, picokfst_class_t * pairClass) +{ + picoos_uint32 pos; + picoos_int32 val; + + kfst_SubObj fst = (kfst_SubObj) this; + if ((*searchState) < 0) { + (*pairFound) = 0; + (*outSym) = PICOKFST_SYMID_ILLEG; + (*pairClass) = -1; + } else { + pos = (*searchState); + BytesToNum(fst->fstStream,& pos,& val); + *outSym = (picokfst_symid_t)val; + if ((*outSym) != PICOKFST_SYMID_ILLEG) { + BytesToNum(fst->fstStream,& pos,& val); + *pairClass = (picokfst_class_t)val; + (*pairFound) = 1; + (*searchState) = pos; + } else { + (*pairFound) = 0; + (*outSym) = PICOKFST_SYMID_ILLEG; + (*pairClass) = -1; + (*searchState) = -1; + } + } +} + + + +/* see description in header file */ +extern void picokfst_kfstGetTrans (picokfst_FST this, picokfst_state_t startState, picokfst_class_t transClass, + picokfst_state_t * endState) +{ + + picoos_uint32 pos; + picoos_int32 index; + picoos_uint32 endStateX; + + kfst_SubObj fst = (kfst_SubObj) this; + if ((startState < 1) || (startState > fst->nrStates) || (transClass < 1) || (transClass > fst->nrClasses)) { + (*endState) = 0; + } else { + index = (startState - 1) * fst->nrClasses + transClass - 1; + pos = fst->transTabPos + (index * fst->transTabEntrySize); + FixedBytesToUnsignedNum(fst->fstStream,fst->transTabEntrySize,& pos,& endStateX); + (*endState) = endStateX; + } +} + + +/* see description in header file */ +extern void picokfst_kfstStartInEpsTransSearch (picokfst_FST this, picokfst_state_t startState, + picoos_bool * inEpsTransFound, picoos_int32 * searchState) +{ + + picoos_int32 offs; + picoos_uint32 pos; + + kfst_SubObj fst = (kfst_SubObj) this; + (*searchState) = -1; + (*inEpsTransFound) = 0; + if ((startState > 0) && (startState <= fst->nrStates)) { + pos = fst->inEpsStateTabPos + (startState - 1) * 4; + FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs); + if (offs > 0) { + (*searchState) = fst->inEpsStateTabPos + offs; + (*inEpsTransFound) = 1; + } + } +} + + + +/* see description in header file */ +extern void picokfst_kfstGetNextInEpsTrans (picokfst_FST this, picoos_int32 * searchState, + picoos_bool * inEpsTransFound, + picokfst_symid_t * outSym, picokfst_state_t * endState) +{ + picoos_uint32 pos; + picoos_int32 val; + + kfst_SubObj fst = (kfst_SubObj) this; + if ((*searchState) < 0) { + (*inEpsTransFound) = 0; + (*outSym) = PICOKFST_SYMID_ILLEG; + (*endState) = 0; + } else { + pos = (*searchState); + BytesToNum(fst->fstStream,& pos,& val); + *outSym = (picokfst_symid_t)val; + if ((*outSym) != PICOKFST_SYMID_ILLEG) { + BytesToNum(fst->fstStream,& pos,& val); + *endState = (picokfst_state_t)val; + (*inEpsTransFound) = 1; + (*searchState) = pos; + } else { + (*inEpsTransFound) = 0; + (*outSym) = PICOKFST_SYMID_ILLEG; + (*endState) = 0; + (*searchState) = -1; + } + } +} + + +/* see description in header file */ +extern picoos_bool picokfst_kfstIsAcceptingState (picokfst_FST this, picokfst_state_t state) +{ + + picoos_uint32 pos; + picoos_uint32 val; + + kfst_SubObj fst = (kfst_SubObj) this; + if ((state > 0) && (state <= fst->nrStates)) { + pos = fst->accStateTabPos + (state - 1); + FixedBytesToUnsignedNum(fst->fstStream,1,& pos,& val); + return (val == 1); + } else { + return 0; + } +} + +#ifdef __cplusplus +} +#endif + +/* End picofst.c */ |